# #red wines
# Red_wine <- read.csv("wineQualityReds.csv", header=TRUE, sep = ",")
# Red_wine$Type <- 'Red'
#
# #white wines
# White_wine <- read.csv("wineQualityWhites.csv", header=TRUE, sep = ",")
# White_wine$Type <- 'White'
#
# ## consolidated
# Data <- rbind(Red_wine,White_wine)
# drops <- c("X")
# Data <- Data[ , !(names(Data) %in% drops)]
# Data
#
# ##create final DF
# # write.csv(Data,"/Users/colinobrien/Desktop/repo/stats_6021/Stats_project_group_6/Data.csv", row.names = FALSE)
# # write.csv(Data,"/Users/colinobrien/Desktop/repo/stats_6021/Stats_project_group_6/Data", row.names = FALSE)
# ## both the Data and Data csv are the same. I know people prefer one format vs the other so I made both
library(tidyverse)
Registered S3 methods overwritten by 'dbplyr':
method from
print.tbl_lazy
print.tbl_sql
[30m── [1mAttaching packages[22m ────────────────────────────────────────────────────────────── tidyverse 1.3.1 ──[39m
[30m[32m✓[30m [34mggplot2[30m 3.3.5 [32m✓[30m [34mpurrr [30m 0.3.4
[32m✓[30m [34mtibble [30m 3.1.2 [32m✓[30m [34mdplyr [30m 1.0.7
[32m✓[30m [34mtidyr [30m 1.1.3 [32m✓[30m [34mstringr[30m 1.4.0
[32m✓[30m [34mreadr [30m 1.4.0 [32m✓[30m [34mforcats[30m 0.5.1[39m
[30m── [1mConflicts[22m ───────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31mx[30m [34mdplyr[30m::[32mfilter()[30m masks [34mstats[30m::filter()
[31mx[30m [34mdplyr[30m::[32mlag()[30m masks [34mstats[30m::lag()[39m
# library(ROCR)
library(faraway)
library(dplyr)
library(ggplot2)
library(reshape2)
Attaching package: ‘reshape2’
The following object is masked from ‘package:tidyr’:
smiths
library(leaps)
# install.packages("bestglm")
library(bestglm)
# install.packages("performance")
# library(performance)
knitr::opts_chunk$set(echo = TRUE)
## Load Datasets
full_wines_final <- read.csv("Data_Final.csv", header = TRUE, stringsAsFactors=TRUE)
# Drop quality for simplicity
full_wines_binary_with_qual<-full_wines_final
full_wines_binary <- subset(full_wines_final, select = -c(quality))
## Convert to 0 and 1 for readability
full_wines_binary$cat_quality <- as.integer(full_wines_binary$cat_quality == "High")
set.seed(90210) ##for reproducibility
sample<-sample.int(nrow(full_wines_binary), floor(.80*nrow(full_wines_binary)), replace = F)
train<-full_wines_binary[sample, ] ##training data frame
rownames(train) <- c(1:5197)
test<-full_wines_binary[-sample, ] ##test data frame
## Just for a single boxplot
train_with_qual<-full_wines_binary_with_qual[sample,]
test_with_qual<-full_wines_binary_with_qual[-sample,]
train
# drops_cats <- c("Type")
# No_cat_train <- train[ , !(names(train) %in% drops_cats)]
# # No_Type
pairs(train, lower.panel = NULL)
# Convert Type to binary to 0 and 1 for correlation
train$Type <- as.integer(train$Type == "White")
test$Type <- as.integer(test$Type == "White")
cor_train <- cor(train)
cor_train
fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
fixed.acidity 1.00000000 0.21475470 0.3281717780 -0.11435784 0.30907486
volatile.acidity 0.21475470 1.00000000 -0.3756083338 -0.19711519 0.38572698
citric.acid 0.32817178 -0.37560833 1.0000000000 0.14135328 0.03681626
residual.sugar -0.11435784 -0.19711519 0.1413532761 1.00000000 -0.13267263
chlorides 0.30907486 0.38572698 0.0368162603 -0.13267263 1.00000000
free.sulfur.dioxide -0.28545891 -0.36296634 0.1459151441 0.40674512 -0.20583891
total.sulfur.dioxide -0.32829667 -0.42380894 0.2063508767 0.49459459 -0.29216595
density 0.46102677 0.27340816 0.0934734894 0.54686423 0.36564000
pH -0.25053297 0.26430327 -0.3266522997 -0.26561668 0.04191555
sulphates 0.31025439 0.23565770 0.0572809414 -0.18182649 0.40162365
alcohol -0.09179407 -0.03401873 -0.0006471143 -0.34842765 -0.25617404
Type -0.48713134 -0.65599571 0.1886904967 0.35126769 -0.52157700
cat_quality -0.07066799 -0.26538562 0.0778223488 -0.02120537 -0.18458731
free.sulfur.dioxide total.sulfur.dioxide density pH sulphates
fixed.acidity -0.28545891 -0.32829667 0.46102677 -0.25053297 0.31025439
volatile.acidity -0.36296634 -0.42380894 0.27340816 0.26430327 0.23565770
citric.acid 0.14591514 0.20635088 0.09347349 -0.32665230 0.05728094
residual.sugar 0.40674512 0.49459459 0.54686423 -0.26561668 -0.18182649
chlorides -0.20583891 -0.29216595 0.36564000 0.04191555 0.40162365
free.sulfur.dioxide 1.00000000 0.71557904 0.01712365 -0.15788076 -0.19860411
total.sulfur.dioxide 0.71557904 1.00000000 0.02373810 -0.24648617 -0.27752789
density 0.01712365 0.02373810 1.00000000 0.01687243 0.27270901
pH -0.15788076 -0.24648617 0.01687243 1.00000000 0.18260830
sulphates -0.19860411 -0.27752789 0.27270901 0.18260830 1.00000000
alcohol -0.17708863 -0.26082628 -0.67847127 0.11700658 -0.01360657
Type 0.48355395 0.70618430 -0.39538819 -0.33097301 -0.49215645
cat_quality 0.04477089 -0.04308093 -0.26437860 0.01854094 0.03195034
alcohol Type cat_quality
fixed.acidity -0.0917940736 -0.48713134 -0.07066799
volatile.acidity -0.0340187335 -0.65599571 -0.26538562
citric.acid -0.0006471143 0.18869050 0.07782235
residual.sugar -0.3484276488 0.35126769 -0.02120537
chlorides -0.2561740351 -0.52157700 -0.18458731
free.sulfur.dioxide -0.1770886318 0.48355395 0.04477089
total.sulfur.dioxide -0.2608262814 0.70618430 -0.04308093
density -0.6784712690 -0.39538819 -0.26437860
pH 0.1170065815 -0.33097301 0.01854094
sulphates -0.0136065691 -0.49215645 0.03195034
alcohol 1.0000000000 0.03945167 0.39668183
Type 0.0394516700 1.00000000 0.12361294
cat_quality 0.3966818275 0.12361294 1.00000000
T_F_cor <- abs(cor_train)>.7
T_F_cor
fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
fixed.acidity TRUE FALSE FALSE FALSE FALSE
volatile.acidity FALSE TRUE FALSE FALSE FALSE
citric.acid FALSE FALSE TRUE FALSE FALSE
residual.sugar FALSE FALSE FALSE TRUE FALSE
chlorides FALSE FALSE FALSE FALSE TRUE
free.sulfur.dioxide FALSE FALSE FALSE FALSE FALSE
total.sulfur.dioxide FALSE FALSE FALSE FALSE FALSE
density FALSE FALSE FALSE FALSE FALSE
pH FALSE FALSE FALSE FALSE FALSE
sulphates FALSE FALSE FALSE FALSE FALSE
alcohol FALSE FALSE FALSE FALSE FALSE
Type FALSE FALSE FALSE FALSE FALSE
cat_quality FALSE FALSE FALSE FALSE FALSE
free.sulfur.dioxide total.sulfur.dioxide density pH sulphates alcohol Type
fixed.acidity FALSE FALSE FALSE FALSE FALSE FALSE FALSE
volatile.acidity FALSE FALSE FALSE FALSE FALSE FALSE FALSE
citric.acid FALSE FALSE FALSE FALSE FALSE FALSE FALSE
residual.sugar FALSE FALSE FALSE FALSE FALSE FALSE FALSE
chlorides FALSE FALSE FALSE FALSE FALSE FALSE FALSE
free.sulfur.dioxide TRUE TRUE FALSE FALSE FALSE FALSE FALSE
total.sulfur.dioxide TRUE TRUE FALSE FALSE FALSE FALSE TRUE
density FALSE FALSE TRUE FALSE FALSE FALSE FALSE
pH FALSE FALSE FALSE TRUE FALSE FALSE FALSE
sulphates FALSE FALSE FALSE FALSE TRUE FALSE FALSE
alcohol FALSE FALSE FALSE FALSE FALSE TRUE FALSE
Type FALSE TRUE FALSE FALSE FALSE FALSE TRUE
cat_quality FALSE FALSE FALSE FALSE FALSE FALSE FALSE
cat_quality
fixed.acidity FALSE
volatile.acidity FALSE
citric.acid FALSE
residual.sugar FALSE
chlorides FALSE
free.sulfur.dioxide FALSE
total.sulfur.dioxide FALSE
density FALSE
pH FALSE
sulphates FALSE
alcohol FALSE
Type FALSE
cat_quality TRUE
## create melted
melted_cor_train <- melt(cor_train)
##create heat map Consolidated
ggplot(data = melted_cor_train, aes(x=Var1, y=Var2, fill=value)) +
geom_tile(color = "white")+
scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0, limit = c(-1,1), space = "Lab", name="Pearson\nCorrelation") +
theme_minimal()+
theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 12, hjust = 1))+
coord_fixed()+
labs(title = 'Consolidated (Both Red and White)')
NA
NA
NA
NA
## creating red and white
train_White <- filter(train, Type == 1)
train_Red <- filter(train, Type == 0)
## droping red/white columns
train_White_NoType <- subset(train_White, select = -c(Type))
train_Red_NoType <- subset(train_Red, select = -c(Type))
## creating correlations
cor_train_White_NoType <- cor(train_White_NoType)
cor_train_Red_NoType <- cor(train_Red_NoType)
## melting
melted_cor_train_white <- melt(cor_train_White_NoType)
melted_cor_train_Red <- melt(cor_train_Red_NoType)
##ploting
##create heat map White
ggplot(data = melted_cor_train_white, aes(x=Var1, y=Var2, fill=value)) +
geom_tile(color = "white")+
scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0, limit = c(-1,1), space = "Lab", name="Pearson\nCorrelation") +
theme_minimal()+
theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 12, hjust = 1))+
coord_fixed()+
labs(title = 'White Wine')
##create heat map Red
ggplot(data = melted_cor_train_Red, aes(x=Var1, y=Var2, fill=value)) +
geom_tile(color = "white")+
scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0, limit = c(-1,1), space = "Lab", name="Pearson\nCorrelation") +
theme_minimal()+
theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 12, hjust = 1))+
coord_fixed()+
labs(title = 'Red Wine')
ggplot(data = train, mapping = aes(x=Type)) +
geom_bar()
## press formula (from class)
get_press <- function(model) {
sum(((model$residuals)/ (1- (lm.influence(model)$hat)))^2)
}
## first go
full<-glm(cat_quality~., family=binomial, data=train)
summary(full)
Call:
glm(formula = cat_quality ~ ., family = binomial, data = train)
Deviance Residuals:
Min 1Q Median 3Q Max
-3.4120 -0.8919 0.4303 0.8148 2.6198
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 1.102e+02 4.889e+01 2.253 0.0242 *
fixed.acidity 7.676e-02 5.573e-02 1.377 0.1684
volatile.acidity -4.720e+00 3.291e-01 -14.342 < 2e-16 ***
citric.acid -4.652e-01 2.875e-01 -1.618 0.1057
residual.sugar 1.187e-01 2.104e-02 5.641 1.69e-08 ***
chlorides -1.265e+00 1.195e+00 -1.059 0.2895
free.sulfur.dioxide 1.345e-02 2.878e-03 4.672 2.98e-06 ***
total.sulfur.dioxide -5.753e-03 1.168e-03 -4.924 8.48e-07 ***
density -1.211e+02 4.964e+01 -2.440 0.0147 *
pH 7.375e-01 3.316e-01 2.224 0.0261 *
sulphates 2.096e+00 2.977e-01 7.042 1.89e-12 ***
alcohol 8.500e-01 6.635e-02 12.811 < 2e-16 ***
Type -5.562e-01 2.077e-01 -2.678 0.0074 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 6833.2 on 5196 degrees of freedom
Residual deviance: 5335.5 on 5184 degrees of freedom
AIC: 5361.5
Number of Fisher Scoring iterations: 5
## removed all insignificant
reduced_1<-glm(formula = cat_quality~volatile.acidity+residual.sugar+free.sulfur.dioxide+total.sulfur.dioxide+density+pH+sulphates+alcohol+Type, family=binomial, data=train)
summary(reduced_1)
Call:
glm(formula = cat_quality ~ volatile.acidity + residual.sugar +
free.sulfur.dioxide + total.sulfur.dioxide + density + pH +
sulphates + alcohol + Type, family = binomial, data = train)
Deviance Residuals:
Min 1Q Median 3Q Max
-3.3471 -0.9020 0.4275 0.8196 2.6625
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 76.416878 31.773373 2.405 0.01617 *
volatile.acidity -4.680054 0.305603 -15.314 < 2e-16 ***
residual.sugar 0.105875 0.014724 7.191 6.44e-13 ***
free.sulfur.dioxide 0.013183 0.002863 4.604 4.15e-06 ***
total.sulfur.dioxide -0.005930 0.001162 -5.101 3.38e-07 ***
density -86.733244 31.675484 -2.738 0.00618 **
pH 0.595611 0.228953 2.601 0.00928 **
sulphates 1.930494 0.285976 6.751 1.47e-11 ***
alcohol 0.894713 0.051370 17.417 < 2e-16 ***
Type -0.523798 0.204855 -2.557 0.01056 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 6833.2 on 5196 degrees of freedom
Residual deviance: 5341.5 on 5187 degrees of freedom
AIC: 5361.5
Number of Fisher Scoring iterations: 5
##evaluating model
Reduced1_AIC_train <- reduced_1$aic
##predicted quality for test data based on training data
preds<-predict(reduced_1,newdata=test, type="response")
reduced_1_error <- table(test$cat_quality, preds>0.5)
reduced_1_error
FALSE TRUE
0 240 236
1 121 703
evulation_summary <- data.frame(
attempt = 'reduced_1',
AIC = Reduced1_AIC_train,
PRESS = get_press(reduced_1),
'False positive' = round(reduced_1_error[3]/(reduced_1_error[1]+reduced_1_error[3]),3),
'False negative' = round(reduced_1_error[2]/(reduced_1_error[2]+reduced_1_error[4]),3),
'Error Rate' = round((reduced_1_error[2]+reduced_1_error[3])/(reduced_1_error[1]+reduced_1_error[2]+reduced_1_error[3]+reduced_1_error[4]),3)
)
evulation_summary
# install.packages("bestglm")
## Prepare data
train.for.best.logistic <- within(train, {
y <- cat_quality
})
## Reorder variables
train.for.best.logistic <-
train.for.best.logistic[, c("fixed.acidity","volatile.acidity","citric.acid","residual.sugar","total.sulfur.dioxide","density","chlorides","free.sulfur.dioxide",'pH','sulphates','alcohol','Type',"y")]
## Perform
res.best.logistic <-
bestglm(Xy = train.for.best.logistic,
family = binomial, # binomial family for logistic
IC = "AIC", # Information criteria for
method = "exhaustive")
Morgan-Tatar search since family is non-gaussian.
res.best.logistic$BestModels
summary(res.best.logistic$BestModel)
Call:
glm(formula = y ~ ., family = family, data = Xi, weights = weights)
Deviance Residuals:
Min 1Q Median 3Q Max
-3.4302 -0.8932 0.4282 0.8157 2.6350
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 1.147e+02 4.864e+01 2.358 0.0184 *
fixed.acidity 8.518e-02 5.513e-02 1.545 0.1223
volatile.acidity -4.763e+00 3.268e-01 -14.574 < 2e-16 ***
citric.acid -5.158e-01 2.834e-01 -1.820 0.0688 .
residual.sugar 1.215e-01 2.085e-02 5.829 5.57e-09 ***
total.sulfur.dioxide -5.688e-03 1.167e-03 -4.876 1.08e-06 ***
density -1.261e+02 4.936e+01 -2.554 0.0106 *
free.sulfur.dioxide 1.332e-02 2.874e-03 4.633 3.61e-06 ***
pH 8.012e-01 3.261e-01 2.457 0.0140 *
sulphates 2.027e+00 2.897e-01 6.998 2.59e-12 ***
alcohol 8.553e-01 6.613e-02 12.934 < 2e-16 ***
Type -5.290e-01 2.059e-01 -2.569 0.0102 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 6833.2 on 5196 degrees of freedom
Residual deviance: 5336.7 on 5185 degrees of freedom
AIC: 5360.7
Number of Fisher Scoring iterations: 5
reduced_4 <- res.best.logistic$BestModel
##evaluating model
Reduced4_AIC_train <- reduced_4$aic
##predicted quality for test data based on training data
preds<-predict(reduced_4,newdata=test, type="response")
reduced_4_error <- table(test$cat_quality, preds>0.5)
evulation_summary_4 <- data.frame(
attempt = 'reduced_4_error (all possible)',
AIC = Reduced4_AIC_train,
PRESS = get_press(reduced_4),
'False positive' = round(reduced_4_error[3]/(reduced_4_error[1]+reduced_4_error[3]),3),
'False negative' = round(reduced_4_error[2]/(reduced_4_error[2]+reduced_4_error[4]),3),
'Error Rate' = round((reduced_4_error[2]+reduced_4_error[3])/(reduced_4_error[1]+reduced_4_error[2]+reduced_4_error[3]+reduced_4_error[4]),3)
)
evulation_summary <- rbind(evulation_summary,evulation_summary_4)
# evulation_summary
# data.frame(check_collinearity(reduced_4))
#come back and add df stuff
reduced_4_2<-glm(cat_quality~volatile.acidity+citric.acid+residual.sugar+total.sulfur.dioxide+density+free.sulfur.dioxide+pH+sulphates+alcohol+Type, family=binomial, data=train)
summary(reduced_4_2)
Call:
glm(formula = cat_quality ~ volatile.acidity + citric.acid +
residual.sugar + total.sulfur.dioxide + density + free.sulfur.dioxide +
pH + sulphates + alcohol + Type, family = binomial, data = train)
Deviance Residuals:
Min 1Q Median 3Q Max
-3.3469 -0.8997 0.4283 0.8144 2.6767
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 58.609496 33.904591 1.729 0.0839 .
volatile.acidity -4.841743 0.323737 -14.956 < 2e-16 ***
citric.acid -0.434875 0.278458 -1.562 0.1184
residual.sugar 0.099352 0.015302 6.493 8.43e-11 ***
total.sulfur.dioxide -0.005815 0.001165 -4.993 5.94e-07 ***
density -68.496442 33.901654 -2.020 0.0433 *
free.sulfur.dioxide 0.013292 0.002871 4.630 3.66e-06 ***
pH 0.465949 0.243355 1.915 0.0555 .
sulphates 1.962718 0.287251 6.833 8.33e-12 ***
alcohol 0.918353 0.053777 17.077 < 2e-16 ***
Type -0.483884 0.206617 -2.342 0.0192 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 6833.2 on 5196 degrees of freedom
Residual deviance: 5339.0 on 5186 degrees of freedom
AIC: 5361
Number of Fisher Scoring iterations: 5
##evaluating model
Reduced4_2_AIC_train <- reduced_4_2$aic
##predicted quality for test data based on training data
preds<-predict(reduced_4_2,newdata=test, type="response")
reduced_4_2_error <- table(test$cat_quality, preds>0.7)
#Curves
evulation_summary_4_2 <- data.frame(
attempt = 'reduced_4_2_error (post VIF adjustments)',
AIC = Reduced4_2_AIC_train,
PRESS = get_press(reduced_4_2),
'False positive' = round(reduced_4_2_error[3]/(reduced_4_2_error[1]+reduced_4_2_error[3]),3),
'False negative' = round(reduced_4_2_error[2]/(reduced_4_2_error[2]+reduced_4_2_error[4]),3),
'Error Rate' = round((reduced_4_2_error[2]+reduced_4_2_error[3])/(reduced_4_2_error[1]+reduced_4_2_error[2]+reduced_4_2_error[3]+reduced_4_2_error[4]),3)
)
evulation_summary <- rbind(evulation_summary,evulation_summary_4_2)
evulation_summary
NA
NA
summary(reduced_4)
Call:
glm(formula = y ~ ., family = family, data = Xi, weights = weights)
Deviance Residuals:
Min 1Q Median 3Q Max
-3.4302 -0.8932 0.4282 0.8157 2.6350
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 1.147e+02 4.864e+01 2.358 0.0184 *
fixed.acidity 8.518e-02 5.513e-02 1.545 0.1223
volatile.acidity -4.763e+00 3.268e-01 -14.574 < 2e-16 ***
citric.acid -5.158e-01 2.834e-01 -1.820 0.0688 .
residual.sugar 1.215e-01 2.085e-02 5.829 5.57e-09 ***
total.sulfur.dioxide -5.688e-03 1.167e-03 -4.876 1.08e-06 ***
density -1.261e+02 4.936e+01 -2.554 0.0106 *
free.sulfur.dioxide 1.332e-02 2.874e-03 4.633 3.61e-06 ***
pH 8.012e-01 3.261e-01 2.457 0.0140 *
sulphates 2.027e+00 2.897e-01 6.998 2.59e-12 ***
alcohol 8.553e-01 6.613e-02 12.934 < 2e-16 ***
Type -5.290e-01 2.059e-01 -2.569 0.0102 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 6833.2 on 5196 degrees of freedom
Residual deviance: 5336.7 on 5185 degrees of freedom
AIC: 5360.7
Number of Fisher Scoring iterations: 5
p <- 12
n <- 5197
reduced_4_cook <-cooks.distance(reduced_4)
reduced_4_cook[reduced_4_cook>qf(0.5,p,n-p)]
named numeric(0)
##dffits
DFFITS<-dffits(reduced_4)
DDFFITS_influence <- DFFITS[abs(DFFITS)>2*sqrt(p/n)]
DDFFITS_influence
83 115 133 148 149 171 183 202
0.11219627 0.10544184 -0.18411002 -0.09991644 -0.12785114 -0.11374142 0.10867107 -0.16045065
206 225 249 292 309 317 380 455
-0.11840913 0.10093853 -0.09841249 0.09816267 0.10078385 0.10471012 -0.11210579 0.09920619
466 572 722 739 747 794 806 912
-0.11390124 0.10171119 -0.16019151 0.11513457 0.09736192 -0.10449640 -0.10120073 0.15167391
952 957 1010 1042 1055 1114 1139 1180
0.09704330 -0.16243046 0.11892944 -0.09863713 -0.11344570 -0.10497624 0.10300372 -0.18433803
1241 1258 1270 1325 1347 1467 1493 1527
0.10969510 0.09821539 0.10530848 -0.09678052 -0.10103881 0.11642941 -0.11091048 -0.11561143
1528 1597 1620 1658 1693 1776 1846 1873
-0.12352731 -0.09863589 -0.10722071 -0.09731307 -0.09795498 -0.10237941 0.12561590 -0.12529594
1882 1891 2002 2004 2020 2079 2103 2148
-0.10642853 0.10446926 0.10894528 0.20125104 0.09637786 -0.10426201 0.09853506 -0.27921127
2203 2256 2258 2285 2305 2350 2353 2361
-0.10129823 -0.10012511 0.10312207 0.10896369 0.13774801 0.09853162 -0.15221240 -0.11012001
2383 2410 2434 2461 2465 2487 2494 2497
-0.10852123 -0.12636270 0.12603788 -0.15438799 -0.10048416 0.10125735 -0.12151311 0.10319545
2521 2525 2527 2551 2594 2650 2696 2699
-0.10237941 -0.10131647 -0.10242317 -0.09669244 -0.10722071 -0.10178224 -0.11840913 -0.10221624
2772 2781 2801 2840 2852 2892 2948 2965
0.20008361 -0.10703304 -0.11210579 0.16042062 -0.10974330 0.09732574 -0.09841249 -0.09647583
2978 2994 2995 2997 3018 3030 3041 3051
-0.11806742 0.14196649 0.09920619 -0.13745390 0.10099437 -0.14516539 0.16695970 -0.12151311
3065 3079 3126 3155 3188 3274 3282 3330
-0.21978199 0.10160629 0.17165718 0.13048253 0.17504970 -0.10882370 0.10171119 0.09880694
3347 3367 3378 3381 3496 3526 3557 3564
-0.11538312 -0.11374142 -0.10444247 0.10177520 0.12512066 -0.15993715 -0.13072803 -0.13765634
3571 3709 3721 3740 3752 3809 3823 3843
-0.11317090 -0.11344570 -0.15438799 -0.10404207 -0.15579966 -0.10786452 0.88209659 0.10113003
3848 3895 3922 4024 4035 4065 4071 4112
0.11006220 -0.18174866 -0.12028121 -0.10640867 -0.11091048 0.09637786 -0.31948427 -0.12024521
4125 4131 4132 4140 4260 4300 4401 4425
-0.10057784 -0.10147570 0.17165718 -0.11174867 -0.11267738 0.10499625 -0.10880237 -0.10594437
4463 4490 4512 4547 4610 4627 4678 4687
-0.12634536 -0.09991644 -0.09647583 -0.17220454 0.10741187 -0.21859387 -0.10392610 0.12149435
4783 4790 4794 4829 4873 5045 5091 5178
0.12315874 0.10969510 -0.09878535 -0.11104591 -0.16746164 -0.10529190 -0.10703304 0.11264144
5187
0.10894528
DFBETAS<-dfbetas(reduced_4)
abs(DFBETAS)>2/sqrt(n)
(Intercept) fixed.acidity volatile.acidity citric.acid residual.sugar total.sulfur.dioxide
1 FALSE FALSE FALSE FALSE FALSE FALSE
2 FALSE FALSE FALSE FALSE FALSE FALSE
3 FALSE TRUE FALSE TRUE TRUE FALSE
4 FALSE FALSE FALSE FALSE FALSE FALSE
5 FALSE FALSE FALSE FALSE FALSE FALSE
6 FALSE FALSE FALSE FALSE FALSE FALSE
7 FALSE FALSE FALSE FALSE FALSE FALSE
8 FALSE FALSE FALSE FALSE TRUE FALSE
9 FALSE FALSE FALSE FALSE FALSE FALSE
10 TRUE TRUE FALSE TRUE TRUE TRUE
11 FALSE FALSE FALSE FALSE FALSE FALSE
12 FALSE FALSE FALSE FALSE FALSE FALSE
13 FALSE FALSE FALSE FALSE FALSE FALSE
14 FALSE FALSE FALSE FALSE FALSE FALSE
15 FALSE FALSE FALSE FALSE FALSE FALSE
16 FALSE FALSE FALSE FALSE FALSE FALSE
17 FALSE FALSE FALSE FALSE FALSE FALSE
18 FALSE FALSE FALSE FALSE FALSE FALSE
19 FALSE FALSE FALSE FALSE FALSE FALSE
20 FALSE FALSE FALSE FALSE FALSE FALSE
21 FALSE FALSE FALSE TRUE FALSE FALSE
22 FALSE FALSE FALSE FALSE FALSE FALSE
23 FALSE TRUE FALSE FALSE TRUE FALSE
24 FALSE FALSE FALSE FALSE FALSE FALSE
25 FALSE TRUE TRUE FALSE FALSE FALSE
26 FALSE FALSE FALSE FALSE FALSE FALSE
27 FALSE FALSE FALSE FALSE FALSE FALSE
28 FALSE FALSE FALSE FALSE FALSE FALSE
29 FALSE FALSE FALSE FALSE FALSE FALSE
30 FALSE FALSE FALSE FALSE FALSE FALSE
31 FALSE FALSE FALSE FALSE FALSE FALSE
32 FALSE FALSE FALSE FALSE FALSE FALSE
33 FALSE FALSE FALSE FALSE FALSE FALSE
34 FALSE FALSE FALSE FALSE FALSE FALSE
35 FALSE FALSE FALSE FALSE FALSE FALSE
36 FALSE FALSE FALSE FALSE FALSE FALSE
37 FALSE FALSE FALSE FALSE FALSE FALSE
38 FALSE FALSE FALSE FALSE FALSE FALSE
39 FALSE FALSE FALSE FALSE FALSE FALSE
40 FALSE FALSE FALSE FALSE FALSE FALSE
41 FALSE FALSE TRUE FALSE FALSE FALSE
42 FALSE FALSE FALSE FALSE FALSE FALSE
43 FALSE FALSE FALSE FALSE FALSE FALSE
44 FALSE FALSE FALSE FALSE FALSE FALSE
45 FALSE FALSE FALSE FALSE FALSE FALSE
46 FALSE FALSE FALSE FALSE FALSE FALSE
47 FALSE FALSE FALSE FALSE TRUE FALSE
48 FALSE FALSE FALSE TRUE FALSE FALSE
49 FALSE FALSE FALSE FALSE FALSE FALSE
50 FALSE FALSE FALSE FALSE FALSE FALSE
51 FALSE FALSE FALSE FALSE FALSE FALSE
52 FALSE FALSE FALSE FALSE FALSE FALSE
53 FALSE FALSE FALSE FALSE FALSE FALSE
54 FALSE FALSE FALSE FALSE FALSE FALSE
55 FALSE FALSE FALSE FALSE FALSE FALSE
56 FALSE FALSE FALSE FALSE FALSE FALSE
57 FALSE FALSE FALSE FALSE FALSE FALSE
58 FALSE FALSE FALSE FALSE FALSE FALSE
59 FALSE FALSE TRUE TRUE FALSE FALSE
60 FALSE FALSE FALSE FALSE FALSE FALSE
61 FALSE FALSE FALSE FALSE FALSE FALSE
62 FALSE FALSE FALSE FALSE FALSE FALSE
63 FALSE FALSE FALSE FALSE FALSE FALSE
64 FALSE FALSE FALSE FALSE FALSE FALSE
65 TRUE TRUE FALSE FALSE TRUE FALSE
66 FALSE FALSE FALSE FALSE FALSE FALSE
67 FALSE FALSE FALSE FALSE FALSE FALSE
68 FALSE FALSE FALSE FALSE FALSE FALSE
69 FALSE FALSE FALSE FALSE FALSE FALSE
70 FALSE FALSE FALSE FALSE FALSE FALSE
71 FALSE FALSE FALSE FALSE FALSE FALSE
72 FALSE FALSE FALSE FALSE FALSE FALSE
73 FALSE FALSE FALSE FALSE FALSE FALSE
74 FALSE FALSE FALSE FALSE FALSE FALSE
75 FALSE FALSE FALSE FALSE FALSE FALSE
76 FALSE FALSE FALSE FALSE FALSE FALSE
77 FALSE FALSE FALSE FALSE FALSE FALSE
78 FALSE FALSE FALSE FALSE FALSE FALSE
79 FALSE FALSE FALSE FALSE FALSE FALSE
80 FALSE FALSE FALSE FALSE FALSE FALSE
81 FALSE TRUE FALSE FALSE FALSE FALSE
82 FALSE FALSE FALSE FALSE FALSE FALSE
83 FALSE TRUE TRUE TRUE FALSE FALSE
density free.sulfur.dioxide pH sulphates alcohol Type
1 FALSE FALSE FALSE FALSE FALSE FALSE
2 FALSE FALSE FALSE FALSE FALSE FALSE
3 FALSE FALSE TRUE FALSE FALSE FALSE
4 FALSE FALSE FALSE FALSE FALSE FALSE
5 FALSE FALSE FALSE FALSE FALSE FALSE
6 FALSE FALSE FALSE FALSE FALSE TRUE
7 FALSE FALSE FALSE FALSE FALSE FALSE
8 FALSE TRUE FALSE TRUE FALSE FALSE
9 FALSE FALSE FALSE FALSE FALSE FALSE
10 TRUE TRUE FALSE FALSE TRUE TRUE
11 FALSE FALSE FALSE FALSE FALSE FALSE
12 FALSE FALSE FALSE FALSE FALSE FALSE
13 FALSE FALSE FALSE FALSE FALSE FALSE
14 FALSE FALSE FALSE FALSE FALSE FALSE
15 FALSE FALSE FALSE TRUE FALSE FALSE
16 FALSE FALSE FALSE FALSE FALSE FALSE
17 FALSE FALSE FALSE TRUE FALSE FALSE
18 FALSE FALSE TRUE FALSE FALSE FALSE
19 FALSE FALSE FALSE FALSE FALSE FALSE
20 FALSE FALSE FALSE FALSE FALSE FALSE
21 FALSE FALSE FALSE FALSE FALSE FALSE
22 FALSE FALSE FALSE FALSE FALSE FALSE
23 FALSE FALSE FALSE FALSE FALSE FALSE
24 FALSE FALSE FALSE FALSE FALSE FALSE
25 FALSE FALSE FALSE FALSE FALSE FALSE
26 FALSE FALSE FALSE FALSE FALSE FALSE
27 FALSE FALSE FALSE FALSE FALSE FALSE
28 FALSE FALSE FALSE FALSE FALSE FALSE
29 FALSE FALSE FALSE FALSE FALSE FALSE
30 FALSE FALSE FALSE FALSE FALSE FALSE
31 FALSE FALSE FALSE FALSE FALSE FALSE
32 FALSE FALSE FALSE FALSE FALSE FALSE
33 FALSE FALSE FALSE FALSE FALSE FALSE
34 FALSE FALSE FALSE FALSE FALSE FALSE
35 FALSE FALSE FALSE FALSE FALSE FALSE
36 FALSE FALSE FALSE FALSE FALSE FALSE
37 FALSE FALSE FALSE FALSE FALSE FALSE
38 FALSE FALSE FALSE FALSE FALSE FALSE
39 FALSE FALSE FALSE FALSE FALSE FALSE
40 FALSE FALSE FALSE FALSE FALSE FALSE
41 FALSE FALSE FALSE TRUE FALSE TRUE
42 FALSE FALSE FALSE FALSE FALSE FALSE
43 FALSE TRUE FALSE TRUE FALSE FALSE
44 FALSE FALSE FALSE FALSE FALSE FALSE
45 FALSE FALSE FALSE FALSE FALSE FALSE
46 FALSE FALSE FALSE FALSE FALSE FALSE
47 FALSE FALSE TRUE FALSE FALSE FALSE
48 FALSE FALSE FALSE FALSE FALSE FALSE
49 FALSE FALSE FALSE FALSE FALSE FALSE
50 FALSE FALSE FALSE FALSE FALSE FALSE
51 FALSE FALSE FALSE FALSE FALSE FALSE
52 FALSE FALSE FALSE FALSE FALSE FALSE
53 FALSE FALSE FALSE FALSE FALSE FALSE
54 FALSE FALSE FALSE FALSE FALSE FALSE
55 FALSE FALSE FALSE FALSE FALSE FALSE
56 FALSE FALSE FALSE FALSE FALSE FALSE
57 FALSE FALSE FALSE FALSE FALSE FALSE
58 FALSE FALSE FALSE FALSE FALSE FALSE
59 FALSE FALSE FALSE FALSE FALSE TRUE
60 FALSE FALSE FALSE FALSE FALSE FALSE
61 FALSE FALSE TRUE FALSE FALSE FALSE
62 FALSE FALSE FALSE FALSE FALSE FALSE
63 FALSE FALSE FALSE FALSE FALSE FALSE
64 FALSE FALSE FALSE FALSE FALSE FALSE
65 TRUE FALSE TRUE FALSE TRUE FALSE
66 FALSE FALSE FALSE FALSE FALSE FALSE
67 FALSE FALSE FALSE FALSE FALSE FALSE
68 FALSE FALSE FALSE FALSE FALSE FALSE
69 FALSE FALSE FALSE FALSE FALSE FALSE
70 FALSE TRUE FALSE FALSE FALSE FALSE
71 FALSE FALSE FALSE FALSE FALSE FALSE
72 FALSE FALSE FALSE FALSE FALSE FALSE
73 FALSE FALSE FALSE FALSE FALSE FALSE
74 FALSE FALSE FALSE FALSE FALSE FALSE
75 FALSE FALSE FALSE FALSE FALSE FALSE
76 FALSE FALSE FALSE FALSE FALSE FALSE
77 FALSE FALSE FALSE FALSE FALSE FALSE
78 FALSE FALSE FALSE FALSE FALSE FALSE
79 FALSE FALSE FALSE FALSE FALSE FALSE
80 FALSE FALSE FALSE FALSE FALSE FALSE
81 FALSE FALSE TRUE FALSE FALSE FALSE
82 FALSE FALSE FALSE FALSE FALSE FALSE
83 FALSE FALSE TRUE FALSE FALSE TRUE
[ reached getOption("max.print") -- omitted 5114 rows ]
##leverages
lev<-lm.influence(reduced_4)$hat
##identify high leverage points
leverages <- lev[lev>2*p/n]
leverages
25 48 53 65 133 145 171 175
0.004849640 0.007980011 0.007028687 0.005763404 0.022975591 0.005173793 0.005219474 0.004875721
187 202 225 228 248 249 255 282
0.006161515 0.005447858 0.006756723 0.005384983 0.008357256 0.004801962 0.005867337 0.005674515
299 309 332 333 339 345 361 380
0.005484537 0.010250038 0.004754212 0.004686940 0.004990699 0.005924282 0.006389325 0.006435227
383 417 427 455 466 489 516 557
0.006875256 0.004943798 0.004867913 0.009165489 0.006894948 0.007172970 0.010652503 0.005554653
600 607 708 722 730 747 749 765
0.005521417 0.005900775 0.004918820 0.005283197 0.005147385 0.005767084 0.005739296 0.005627857
770 806 828 873 899 912 945 957
0.004853531 0.005540805 0.007828147 0.006995196 0.005410461 0.009841709 0.005463074 0.011974020
965 995 1010 1037 1055 1114 1132 1139
0.005623028 0.014123202 0.004921736 0.006212390 0.006141910 0.005265677 0.006966068 0.005959266
1140 1152 1169 1176 1180 1188 1241 1255
0.006430522 0.006059376 0.004777701 0.005844926 0.012489770 0.004953075 0.004856461 0.004777701
1265 1300 1304 1306 1325 1345 1375 1427
0.005672314 0.006081786 0.005100154 0.005735362 0.005574241 0.004711974 0.006383030 0.004930082
1467 1479 1481 1482 1493 1495 1507 1518
0.006002690 0.006720302 0.005242150 0.014123202 0.007969759 0.004698112 0.004854829 0.005124711
1527 1528 1612 1614 1638 1642 1649 1667
0.005382525 0.005299431 0.009822894 0.008993399 0.007179770 0.007952008 0.005823824 0.007088896
1711 1719 1768 1801 1829 1845 1846 1873
0.004814463 0.004889259 0.004851478 0.004848096 0.004766974 0.006026022 0.005410441 0.005408285
1882 1887 1891 1894 1929 1933 1935 1941
0.004657424 0.005524176 0.005158734 0.004960904 0.007033838 0.006340479 0.005367322 0.005155488
1956 1979 2002 2004 2019 2097 2106 2120
0.004781888 0.006023785 0.005565063 0.013293498 0.005674515 0.004945639 0.005391888 0.005466349
2135 2137 2144 2148 2155 2165 2177 2203
0.006032961 0.005458302 0.005593131 0.020382883 0.005008204 0.005477897 0.005419422 0.004636068
2211 2218 2232 2270 2281 2294 2305 2306
0.004669295 0.019421769 0.006338349 0.007220018 0.006915245 0.006786840 0.010603689 0.005150988
2317 2350 2353 2361 2371 2379 2383 2391
0.011839638 0.004828941 0.012864370 0.005000134 0.005408758 0.005477511 0.006524792 0.005514256
2396 2400 2410 2420 2434 2461 2469 2494
0.008839432 0.007088896 0.007468644 0.004680633 0.020272029 0.016139547 0.005146048 0.008984307
2497 2525 2602 2623 2650 2663 2680 2688
0.006080078 0.006378263 0.007252053 0.006726745 0.005745347 0.006470764 0.007158112 0.005672314
2699 2772 2777 2779 2798 2801 2812 2840
0.007016065 0.013464526 0.012846622 0.006384951 0.004748024 0.006435227 0.007487716 0.008115731
2849 2852 2856 2900 2910 2912 2946 2948
0.006556965 0.004882351 0.006384951 0.005156491 0.006091693 0.004787344 0.005054188 0.004801962
2965 2969 2971 2990 2994 2995 3004 3008
0.005054630 0.005674515 0.004891501 0.005016018 0.005494005 0.009165489 0.004821646 0.005616506
3012 3018 3021 3030 3041 3046 3051 3052
0.005184523 0.007585533 0.007383966 0.008916163 0.016455429 0.005893423 0.008984307 0.005310204
3065 3074 3126 3130 3155 3188 3221 3274
0.012183875 0.004946746 0.019492710 0.005384397 0.007093095 0.015836977 0.005940855 0.005995139
3292 3309 3324 3332 3347 3367 3378 3381
0.004646160 0.005018396 0.005633797 0.008055287 0.007555507 0.005219474 0.005844088 0.007215437
3413 3418 3419 3446 3469 3474 3487 3496
0.007952008 0.004995318 0.005468012 0.004726109 0.005565027 0.004674916 0.005068929 0.009369520
3519 3524 3526 3545 3549 3557 3587 3611
0.005543214 0.004635661 0.013422322 0.004757291 0.004918820 0.007684917 0.004872856 0.004872856
3622 3650 3666 3709 3710 3720 3721 3727
0.004733307 0.005857163 0.006405776 0.006141910 0.005615658 0.006389325 0.016139547 0.006618658
3741 3752 3763 3769 3787 3792 3807 3818
0.004870547 0.016330934 0.005287558 0.004728145 0.004943798 0.007689718 0.005998108 0.010775791
3820 3823 3824 3835 3843 3848 3854 3863
0.007366569 0.323058874 0.012258040 0.004967080 0.005005760 0.006333501 0.007689718 0.008161154
3890 3895 3912 3917 3922 3953 3991 4024
0.005616506 0.019354101 0.004912963 0.007690767 0.010025916 0.019421769 0.004912256 0.005151978
4035 4071 4088 4112 4125 4129 4131 4132
0.007969759 0.033246417 0.004918863 0.007559978 0.005498808 0.004805200 0.007131721 0.019492710
4140 4143 4174 4180 4191 4195 4260 4287
0.009613280 0.004686940 0.006136951 0.005524176 0.005514261 0.005146048 0.009759507 0.004801486
4300 4301 4343 4362 4401 4463 4482 4493
0.005225384 0.005310204 0.007165155 0.005507166 0.005566348 0.017712588 0.008767404 0.007154999
4511 4512 4547 4553 4558 4585 4627 4633
0.006995196 0.005054630 0.009347506 0.010326676 0.006875101 0.004851478 0.014885235 0.005546699
4639 4687 4693 4696 4710 4715 4754 4759
0.004777701 0.006998472 0.004640983 0.005081857 0.005440630 0.005196371 0.006386644 0.007418423
4790 4829 4873 4902 4908 4972 4992 5040
0.004856461 0.006130203 0.012769233 0.005893423 0.008892413 0.004912256 0.006275606 0.004889259
5041 5054 5087 5107 5154 5178 5187
0.007383966 0.006838011 0.004875721 0.006541276 0.007677790 0.006034090 0.005565063
reduced_4.res <- reduced_4$residuals
crit<-qt(1-0.05/(2*n), n-p-1)
outliers <- reduced_4.res[abs(reduced_4.res)>crit]
outliers
8 12 21 30 83 85 86 101
-5.913799 -112.810937 -5.131140 -28.388375 7.064053 -4.902421 5.268800 -6.738462
115 122 142 148 149 154 166 183
5.835978 -11.281066 -4.900743 -5.950818 -8.284466 -4.614577 5.536043 18.759111
202 206 218 245 264 281 288 372
-11.064846 -6.976696 7.906909 -7.166119 6.562536 4.514517 -9.756502 -9.170360
373 440 505 509 525 548 566 589
-4.811386 -4.829656 10.431685 -5.199469 32.192103 -42.498339 10.705696 -4.616565
629 655 657 659 675 687 701 706
-5.057653 4.903313 -10.049213 -4.454203 -5.471326 -6.529182 -9.150344 -8.992792
717 721 722 725 739 751 768 774
-7.819105 -8.206050 -11.839559 -6.756563 4.469453 9.211716 -5.691134 -4.977692
788 794 795 817 819 833 840 844
5.293240 -5.395176 -9.673765 -13.711257 -4.723057 -10.325351 -4.977692 7.896471
875 888 902 903 917 954 964 967
-13.765446 4.511825 -4.486556 4.815690 -9.997107 11.645455 -5.057653 7.233862
991 1039 1077 1081 1107 1124 1133 1156
-4.851860 -29.863426 -6.559467 -5.979445 -11.538933 -9.743607 -8.597773 -358.992053
1162 1167 1171 1172 1226 1235 1257 1258
-5.913799 -13.370172 -5.991466 -13.904782 -24.686598 -22.516690 9.644487 4.622980
1263 1270 1305 1318 1344 1356 1387 1412
6.011786 8.044160 -5.145541 -5.100743 5.658642 -6.254700 -4.851860 -9.750690
1428 1436 1487 1497 1510 1524 1610 1620
-11.498443 -8.316040 7.687631 5.004046 -7.060659 4.482981 -16.343618 -5.238351
1652 1658 1665 1669 1693 1706 1776 1787
-11.418067 -7.852930 -5.163340 -16.484320 -4.689106 -4.877840 -16.318931 -5.613721
1790 1840 1930 1951 1958 1998 2004 2006
9.189242 -4.798258 -6.274087 -7.515991 -6.682392 9.251057 4.599440 5.240035
2018 2040 2041 2059 2079 2115 2119 2148
7.500553 4.734365 -5.264313 -15.336442 -6.043512 -23.388238 -4.859254 -6.605338
2156 2167 2195 2205 2220 2256 2258 2262
-5.287846 -9.600193 -5.857549 -4.579628 4.948918 -9.053372 17.356606 -7.264594
2284 2290 2313 2329 2334 2335 2337 2386
-16.484320 5.684278 -4.851860 -7.041619 -7.417245 -4.555771 -5.816303 -12.954904
2439 2452 2480 2485 2506 2521 2536 2579
9.264406 -8.282663 -4.793832 -18.780311 4.541019 -16.318931 -4.884719 9.211716
2594 2628 2629 2642 2660 2661 2666 2696
-5.238351 -6.381604 4.489202 -5.096416 4.453651 -5.461202 -6.254700 -6.976696
2702 2718 2727 2735 2745 2763 2772 2781
7.233862 -6.726264 5.198906 -6.197960 -6.606402 -11.892705 4.431016 -8.362063
2793 2795 2802 2840 2866 2892 2898 2955
-4.697738 -4.530220 -7.010108 4.976958 -7.349414 11.007314 -4.528817 -14.101843
2978 2994 2997 3005 3025 3040 3042 3065
-13.426497 6.463923 -14.095802 5.659028 -4.737794 -8.881971 -6.738462 -7.313679
3072 3079 3090 3105 3133 3149 3196 3210
-6.423514 16.073667 5.085934 -5.548543 -5.466040 -5.426271 -6.355608 -7.747739
3243 3248 3257 3264 3322 3336 3343 3351
-6.033627 5.580542 -4.595999 -5.857549 -9.308640 9.264406 -4.501913 5.867902
3362 3369 3400 3410 3414 3424 3434 3461
-13.219174 -12.293034 -4.896657 -5.318501 -9.993826 -6.001601 -10.863890 -6.353654
3522 3564 3566 3571 3582 3605 3629 3638
-8.866347 -8.604661 -6.119343 -6.889088 -5.322449 -9.260519 -8.348111 -6.028213
3711 3716 3740 3750 3784 3816 3841 3853
-6.506898 5.779253 -7.586267 5.060560 -4.746179 -6.432015 -6.569981 5.240035
3855 3866 3903 3910 3913 3915 3942 3947
-10.826648 -4.935304 5.739331 -9.056519 -4.882192 -16.323382 -5.403135 -4.485039
3951 4003 4019 4090 4093 4111 4116 4127
-6.092031 -8.881971 -4.877840 -4.542401 -5.913799 -15.362967 -5.909070 -7.854482
4144 4145 4151 4156 4159 4170 4233 4243
4.541019 -8.716350 -9.593344 -7.055209 -42.578664 -4.837952 -4.614577 -5.850980
4248 4306 4311 4329 4354 4356 4460 4490
-14.114484 -5.627580 4.678171 -11.261020 -14.672621 -5.417642 6.046933 -5.950818
4503 4505 4527 4544 4547 4556 4580 4610
-6.716718 4.453651 4.477563 -8.992792 -4.960846 -4.813642 -13.243536 5.353589
4614 4627 4650 4656 4659 4677 4701 4713
-5.833289 -4.965536 -4.896657 4.644296 -6.288973 5.012168 -10.498461 8.507914
4729 4783 4794 4815 4858 4880 4887 4910
-6.726264 7.580155 -5.470226 -5.334876 -5.096416 -6.738462 5.612130 -6.821823
4924 4939 4988 5045 5049 5051 5091 5095
-4.543092 4.488823 -6.821823 -11.196377 -4.972743 -6.197392 -8.362063 -7.809710
5133 5140 5179 5195
-5.399943 4.815690 -8.541941 -10.162346
## outliers removed
outliers_index <- attr(outliers, "names")
outliers_index <- as.numeric(outliers_index)
train_no_outliers <- train[-(outliers_index),]
#leverages removed
lererages_index <- attr(leverages, "names")
lererages_index <- as.numeric(lererages_index)
train_no_leverages <- train[-(lererages_index),]
# DDFFITS_influence
DDFFITS_index <- attr(DDFFITS_influence, "names")
DDFFITS_index <- as.numeric(DDFFITS_index)
train_no_DDFFITS <- train[-(DDFFITS_index),]
# all "non-normal" removed
all_special <- c(DDFFITS_index,lererages_index,outliers_index)
train_nothing_special <- train[-(all_special),]
train_nothing_special
NA
NA
NA
NA
vif(train[c(2,3,4,7,8,6,9,10,11)])
volatile.acidity citric.acid residual.sugar total.sulfur.dioxide
1.818602 1.505238 3.353752 2.850601
density free.sulfur.dioxide pH sulphates
5.664883 2.099196 1.338240 1.427424
alcohol
2.890129
train_temp<-train
# as.factor(train_temp$Type)<-numeric(train_temp$Type)
#train_temp
# as.factor
train_temp$Type <- as.numeric(train_temp$Type)-1
train_temp$Type <- as.integer(train_temp$Type)
train_temp
reduced_4_3 <- glm(cat_quality~volatile.acidity+citric.acid+residual.sugar+total.sulfur.dioxide+density+free.sulfur.dioxide+pH+sulphates+alcohol+Type, family=binomial, data=train_no_outliers)
reduced_4_4_lev <- glm(cat_quality~volatile.acidity+citric.acid+residual.sugar+total.sulfur.dioxide+density+free.sulfur.dioxide+pH+sulphates+alcohol+Type, family=binomial, data=train_no_leverages)
reduced_4_5_DDFFITS <- glm(cat_quality~volatile.acidity+citric.acid+residual.sugar+total.sulfur.dioxide+density+free.sulfur.dioxide+pH+sulphates+alcohol+Type, family=binomial, data=train_no_DDFFITS)
reduced_4_6_no_special <- glm(cat_quality~volatile.acidity+citric.acid+residual.sugar+total.sulfur.dioxide+density+free.sulfur.dioxide+pH+sulphates+alcohol+Type, family=binomial, data=train_nothing_special)
# summary(reduced_4_6_no_special)
## checking colinearity / VIF scores
# reduced_4_3_col <- data.frame('reduced_4_3' = check_collinearity(reduced_4_3))
# reduced_4_3_col_VIF <- reduced_4_3_col[c('reduced_4_3.Term','reduced_4_3.VIF')]
# reduced_4_3_col_VIF
#
# reduced_4_4_lev_col <- data.frame('reduced_4_4_lev' = check_collinearity(reduced_4_4_lev))
# reduced_4_4_lev_col_VIF <- reduced_4_4_lev_col[c('reduced_4_4_lev.Term','reduced_4_4_lev.VIF')]
# reduced_4_4_lev_col_VIF
#
#
# reduced_4_5_DDFFITS_col <- data.frame('reduced_4_5_DDFFITS' = check_collinearity(reduced_4_5_DDFFITS))
# reduced_4_5_DDFFITS_col_VIF <- reduced_4_5_DDFFITS_col[c('reduced_4_5_DDFFITS.Term','reduced_4_5_DDFFITS.VIF')]
# reduced_4_5_DDFFITS_col_VIF
#
# reduced_4_6_no_special_col <- data.frame('reduced_4_6_no_special' = check_collinearity(reduced_4_6_no_special))
# reduced_4_6_no_special_col_VIF <- reduced_4_6_no_special_col[c('reduced_4_6_no_special.Term','reduced_4_6_no_special.VIF')]
# reduced_4_6_no_special_col_VIF
#
# VIF_summary <- data.frame('0'=reduced_4_3_col_VIF['reduced_4_3.Term'],
# '1'=reduced_4_3_col_VIF['reduced_4_3.VIF'],
# '2'=reduced_4_4_lev_col_VIF['reduced_4_4_lev.VIF'],
# '3'=reduced_4_5_DDFFITS_col_VIF['reduced_4_5_DDFFITS.VIF'],
# '4'=reduced_4_6_no_special_col_VIF['reduced_4_6_no_special.VIF'])
# colnames(VIF_summary) <- c('Predictor Variable','4_3.VIF.Outliers','4_4_lev.VIF','4_5_DDFFITS.VIF','4_6_no_special.VIF')
# VIF_summary
## VIF for Outliers
### cat_quality~volatile.acidity+citric.acid+residual.sugar+total.sulfur.dioxide+density+free.sulfur.dioxide+pH+sulphates+alcohol+Type
#
reg_4_VIF_test <- vif(train_temp[c(1,2,3,4,7,8,6,9,10,11,12)])
reg_4_2_VIF_test <- vif(train_temp[c(2,3,4,7,8,6,9,10,11,12)])
outliers_VIF <- vif(train_no_outliers[c(2,3,4,7,8,6,9,10,11,12)])
leverage_VIF <- vif(train_no_leverages[c(2,3,4,7,8,6,9,10,11,12)])
DDFFITS_VIF <- vif(train_no_DDFFITS[c(2,3,4,7,8,6,9,10,11,12)])
nothing_special <- vif(train_nothing_special[c(2,3,4,7,8,6,9,10,11,12)])
reg_4_VIF_test
fixed.acidity volatile.acidity citric.acid residual.sugar
4.962487 2.139008 1.585795 9.280982
total.sulfur.dioxide density free.sulfur.dioxide pH
3.974170 21.338984 2.189173 2.483275
sulphates alcohol Type
1.514014 5.371286 7.045601
VIF_summary_test <- data.frame('best_possible_VIF (post)'=reg_4_2_VIF_test,
'outliers_VIF'=outliers_VIF,
'leverage_VIF'=leverage_VIF,
'DDFFITS_VIF'= DDFFITS_VIF,
'nothing_special'=nothing_special)
VIF_summary_test
NA
##evaluating model
Reduced4_3_AIC_train <- reduced_4_3$aic
##predicted quality for test data based on training data
preds<-predict(reduced_4_3,newdata=test, type="response")
reduced_4_3_error <- table(test$cat_quality, preds>0.6)
evulation_summary_4_3 <- data.frame(
attempt = 'reduced_4_3_error_outliers',
AIC = Reduced4_3_AIC_train,
PRESS = get_press(reduced_4_3),
'False positive' = round(reduced_4_3_error[3]/(reduced_4_3_error[1]+reduced_4_3_error[3]),3),
'False negative' = round(reduced_4_3_error[2]/(reduced_4_3_error[2]+reduced_4_3_error[4]),3),
'Error Rate' = round((reduced_4_3_error[2]+reduced_4_3_error[3])/(reduced_4_3_error[1]+reduced_4_3_error[2]+reduced_4_3_error[3]+reduced_4_3_error[4]),3)
)
evulation_summary <- rbind(evulation_summary,evulation_summary_4_3)
evulation_summary
##evaluating model leverage
reduced_4_4_lev_AIC_train <- reduced_4_4_lev$aic
##predicted quality for test data based on training data
preds<-predict(reduced_4_4_lev,newdata=test, type="response")
reduced_4_4_lev_error <- table(test$cat_quality, preds>0.65)
evulation_summary_4_4_lev <- data.frame(
attempt = 'reduced_4_4_lev_error',
AIC = reduced_4_4_lev_AIC_train,
PRESS = get_press(reduced_4_4_lev),
'False positive' = round(reduced_4_4_lev_error[3]/(reduced_4_4_lev_error[1]+reduced_4_4_lev_error[3]),3),
'False negative' = round(reduced_4_4_lev_error[2]/(reduced_4_4_lev_error[2]+reduced_4_4_lev_error[4]),3),
'Error Rate' = round((reduced_4_4_lev_error[2]+reduced_4_4_lev_error[3])/(reduced_4_4_lev_error[1]+reduced_4_4_lev_error[2]+reduced_4_4_lev_error[3]+reduced_4_4_lev_error[4]),3)
)
evulation_summary <- rbind(evulation_summary,evulation_summary_4_4_lev)
evulation_summary
##evaluating model DDFFITS
reduced_4_5_DDFFITS_AIC_train <- reduced_4_5_DDFFITS$aic
##predicted quality for test data based on training data
preds<-predict(reduced_4_5_DDFFITS,newdata=test, type="response")
reduced_4_5_DDFFITS_error <- table(test$cat_quality, preds>0.7)
evulation_summary_4_5_DDFFITS <- data.frame(
attempt = 'reduced_4_5_DDFFITS_error',
AIC = reduced_4_5_DDFFITS_AIC_train,
PRESS = get_press(reduced_4_5_DDFFITS),
'False positive' = round(reduced_4_5_DDFFITS_error[3]/(reduced_4_5_DDFFITS_error[1]+reduced_4_5_DDFFITS_error[3]),3),
'False negative' = round(reduced_4_5_DDFFITS_error[2]/(reduced_4_5_DDFFITS_error[2]+reduced_4_5_DDFFITS_error[4]),3),
'Error Rate' = round((reduced_4_5_DDFFITS_error[2]+reduced_4_5_DDFFITS_error[3])/(reduced_4_5_DDFFITS_error[1]+reduced_4_5_DDFFITS_error[2]+reduced_4_5_DDFFITS_error[3]+reduced_4_5_DDFFITS_error[4]),3)
)
evulation_summary <- rbind(evulation_summary,evulation_summary_4_5_DDFFITS)
evulation_summary
##evaluating model DDFFITS
reduced_4_6_no_special_AIC_train <- reduced_4_6_no_special$aic
##predicted quality for test data based on training data
preds<-predict(reduced_4_6_no_special,newdata=test, type="response")
reduced_4_6_no_special_error <- table(test$cat_quality, preds>0.8)
evulation_summary_4_6_no_special <- data.frame(
attempt = 'reduced_4_6_no_special_error',
AIC = reduced_4_6_no_special_AIC_train,
PRESS = get_press(reduced_4_6_no_special),
'False positive' = round(reduced_4_6_no_special_error[3]/(reduced_4_6_no_special_error[1]+reduced_4_6_no_special_error[3]),3),
'False negative' = round(reduced_4_6_no_special_error[2]/(reduced_4_6_no_special_error[2]+reduced_4_6_no_special_error[4]),3),
'Error Rate' = round((reduced_4_6_no_special_error[2]+reduced_4_6_no_special_error[3])/(reduced_4_6_no_special_error[1]+reduced_4_6_no_special_error[2]+reduced_4_6_no_special_error[3]+reduced_4_6_no_special_error[4]),3)
)
evulation_summary <- rbind(evulation_summary,evulation_summary_4_6_no_special)
evulation_summary
## reduced_1
# detach(package:performance, unload=TRUE)
## FYI the performance package causes ROC curves to not work
library(ROCR)
# reduced_1
preds<-predict(reduced_1,newdata=test, type="response")
rates<-prediction(preds, test$cat_quality)
roc_result<-performance(rates,measure="tpr", x.measure="fpr")
plot(roc_result, main="ROC Curve for reduced_1")
lines(x = c(0,1), y = c(0,1), col="red")
auc<-performance(rates, measure = "auc")
reduced_1_auc <- auc@y.values
## reduced_4
preds<-predict(reduced_4,newdata=test, type="response")
rates4<-prediction(preds, test$cat_quality)
roc_result<-performance(rates4,measure="tpr", x.measure="fpr")
plot(roc_result, main="ROC Curve for reduced_4")
lines(x = c(0,1), y = c(0,1), col="red")
auc4<-performance(rates4, measure = "auc")
reduced_4_auc <- auc4@y.values
## reduced_4_2
preds<-predict(reduced_4_2,newdata=test, type="response")
rates<-prediction(preds, test$cat_quality)
roc_result<-performance(rates,measure="tpr", x.measure="fpr")
plot(roc_result, main="ROC Curve for reduced_4_2")
lines(x = c(0,1), y = c(0,1), col="red")
auc4_2<-performance(rates, measure = "auc")
reduced_4_2_auc <- auc4_2@y.values
## reduced_4_3
preds<-predict(reduced_4_3,newdata=test, type="response")
rates<-prediction(preds, test$cat_quality)
roc_result<-performance(rates,measure="tpr", x.measure="fpr")
plot(roc_result, main="ROC Curve for reduced_4_3")
lines(x = c(0,1), y = c(0,1), col="red")
auc<-performance(rates, measure = "auc")
reduced_4_3_auc <- auc@y.values
## reduced_4_4_lev
preds<-predict(reduced_4_4_lev,newdata=test, type="response")
rates<-prediction(preds, test$cat_quality)
roc_result<-performance(rates,measure="tpr", x.measure="fpr")
plot(roc_result, main="ROC Curve for reduced_4_4_lev")
lines(x = c(0,1), y = c(0,1), col="red")
auc<-performance(rates, measure = "auc")
reduced_4_4_lev_auc <- auc@y.values
## reduced_4_5_DDFFITS
preds<-predict(reduced_4_5_DDFFITS,newdata=test, type="response")
rates<-prediction(preds, test$cat_quality)
roc_result<-performance(rates,measure="tpr", x.measure="fpr")
plot(roc_result, main="ROC Curve for reduced_4_5_DDFFITS")
lines(x = c(0,1), y = c(0,1), col="red")
auc<-performance(rates, measure = "auc")
reduced_4_5_DDFFITS_auc <- auc@y.values
## reduced_4_6_no_special
preds<-predict(reduced_4_6_no_special,newdata=test, type="response")
rates<-prediction(preds, test$cat_quality)
roc_result<-performance(rates,measure="tpr", x.measure="fpr")
plot(roc_result, main="ROC Curve for reduced_4_6_no_special")
lines(x = c(0,1), y = c(0,1), col="red")
auc<-performance(rates, measure = "auc")
reduced_4_6_no_special_auc <- auc@y.values
AUC_summary <- data.frame('reduced_1'=reduced_1_auc,
'reduced_4'=reduced_4_auc,
'reduced_4_2'=reduced_4_2_auc,
'reduced_4_3'=reduced_4_3_auc,
'reduced_4_4_lev'=reduced_4_4_lev_auc,
'reduced_4_5_DDFFITS'=reduced_4_5_DDFFITS_auc,
'reduced_4_6_no_special'=reduced_4_6_no_special_auc)
colnames(AUC_summary) <- c('reduced_1','reduced_4','reduced_4_2','reduced_4_3','reduced_4_4_lev','reduced_4_5_DDFFITS','reduced_4_6_no_special')
AUC_summary
regfull_Red<-glm(cat_quality~., family="binomial", data=train_Red_NoType)
regnull_Red<-glm(cat_quality~1, family="binomial", data=train_Red_NoType)
step(regnull_Red, scope=list(lower=regnull_Red, upper=regfull_Red), direction="forward")
Start: AIC=1811.89
cat_quality ~ 1
Df Deviance AIC
+ alcohol 1 1540.6 1544.6
+ volatile.acidity 1 1680.6 1684.6
+ total.sulfur.dioxide 1 1726.9 1730.9
+ sulphates 1 1741.1 1745.1
+ citric.acid 1 1778.7 1782.7
+ density 1 1780.0 1784.0
+ chlorides 1 1795.4 1799.4
+ fixed.acidity 1 1797.8 1801.8
+ free.sulfur.dioxide 1 1803.8 1807.8
<none> 1809.9 1811.9
+ pH 1 1809.8 1813.8
+ residual.sugar 1 1809.9 1813.9
Step: AIC=1544.61
cat_quality ~ alcohol
Df Deviance AIC
+ volatile.acidity 1 1450.7 1456.7
+ sulphates 1 1492.7 1498.7
+ total.sulfur.dioxide 1 1507.4 1513.4
+ fixed.acidity 1 1521.6 1527.6
+ citric.acid 1 1523.5 1529.5
+ pH 1 1528.7 1534.7
+ density 1 1535.8 1541.8
<none> 1540.6 1544.6
+ free.sulfur.dioxide 1 1539.2 1545.2
+ residual.sugar 1 1539.8 1545.8
+ chlorides 1 1540.5 1546.5
Step: AIC=1456.66
cat_quality ~ alcohol + volatile.acidity
Df Deviance AIC
+ total.sulfur.dioxide 1 1417.1 1425.1
+ sulphates 1 1427.5 1435.5
+ fixed.acidity 1 1448.1 1456.1
+ free.sulfur.dioxide 1 1448.1 1456.1
<none> 1450.7 1456.7
+ citric.acid 1 1449.0 1457.0
+ density 1 1449.5 1457.5
+ residual.sugar 1 1449.8 1457.8
+ pH 1 1449.9 1457.9
+ chlorides 1 1450.7 1458.7
Step: AIC=1425.14
cat_quality ~ alcohol + volatile.acidity + total.sulfur.dioxide
Df Deviance AIC
+ sulphates 1 1387.4 1397.4
+ free.sulfur.dioxide 1 1409.8 1419.8
<none> 1417.1 1425.1
+ pH 1 1415.8 1425.8
+ density 1 1416.1 1426.1
+ fixed.acidity 1 1416.2 1426.2
+ citric.acid 1 1416.8 1426.8
+ residual.sugar 1 1417.0 1427.0
+ chlorides 1 1417.1 1427.1
Step: AIC=1397.37
cat_quality ~ alcohol + volatile.acidity + total.sulfur.dioxide +
sulphates
Df Deviance AIC
+ free.sulfur.dioxide 1 1379.6 1391.6
+ chlorides 1 1380.0 1392.0
+ citric.acid 1 1383.9 1395.9
<none> 1387.4 1397.4
+ residual.sugar 1 1387.1 1399.1
+ fixed.acidity 1 1387.3 1399.3
+ pH 1 1387.4 1399.4
+ density 1 1387.4 1399.4
Step: AIC=1391.64
cat_quality ~ alcohol + volatile.acidity + total.sulfur.dioxide +
sulphates + free.sulfur.dioxide
Df Deviance AIC
+ chlorides 1 1372.6 1386.6
<none> 1379.6 1391.6
+ citric.acid 1 1377.9 1391.9
+ pH 1 1379.3 1393.3
+ fixed.acidity 1 1379.3 1393.3
+ residual.sugar 1 1379.5 1393.5
+ density 1 1379.6 1393.6
Step: AIC=1386.63
cat_quality ~ alcohol + volatile.acidity + total.sulfur.dioxide +
sulphates + free.sulfur.dioxide + chlorides
Df Deviance AIC
<none> 1372.6 1386.6
+ pH 1 1371.2 1387.2
+ citric.acid 1 1372.1 1388.1
+ fixed.acidity 1 1372.2 1388.2
+ residual.sugar 1 1372.2 1388.2
+ density 1 1372.6 1388.6
Call: glm(formula = cat_quality ~ alcohol + volatile.acidity + total.sulfur.dioxide +
sulphates + free.sulfur.dioxide + chlorides, family = "binomial",
data = train_Red_NoType)
Coefficients:
(Intercept) alcohol volatile.acidity total.sulfur.dioxide
-8.02155 0.85512 -2.93115 -0.01843
sulphates free.sulfur.dioxide chlorides
2.63739 0.02326 -4.10141
Degrees of Freedom: 1308 Total (i.e. Null); 1302 Residual
Null Deviance: 1810
Residual Deviance: 1373 AIC: 1387
The model looks great after the foward selection! Time to test and add to the evaluation summary.
model1_Red<-glm(formula = cat_quality ~ alcohol + volatile.acidity + total.sulfur.dioxide +
sulphates + free.sulfur.dioxide + chlorides, family = "binomial",
data = train_Red_NoType)
summary(model1_Red)
Call:
glm(formula = cat_quality ~ alcohol + volatile.acidity + total.sulfur.dioxide +
sulphates + free.sulfur.dioxide + chlorides, family = "binomial",
data = train_Red_NoType)
Deviance Residuals:
Min 1Q Median 3Q Max
-3.0655 -0.8628 0.3239 0.8474 2.3082
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -8.021553 0.886912 -9.044 < 2e-16 ***
alcohol 0.855123 0.078409 10.906 < 2e-16 ***
volatile.acidity -2.931148 0.413395 -7.090 1.34e-12 ***
total.sulfur.dioxide -0.018428 0.002926 -6.297 3.03e-10 ***
sulphates 2.637391 0.456973 5.771 7.86e-09 ***
free.sulfur.dioxide 0.023257 0.008608 2.702 0.00689 **
chlorides -4.101411 1.588464 -2.582 0.00982 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 1809.9 on 1308 degrees of freedom
Residual deviance: 1372.6 on 1302 degrees of freedom
AIC: 1386.6
Number of Fisher Scoring iterations: 4
##evaluating model
model1_Red_AIC_train <- model1_Red$aic
##predicted quality for test data based on training data
test_Red_NoType<-subset(test, Type == 0, select=-c(Type))
preds<-predict(model1_Red,newdata=test_Red_NoType, type="response")
model1_Red_error <- table(test_Red_NoType$cat_quality, preds>0.7)
#Curves
evulation_summary_1R <- data.frame(
attempt = 'model1_Red',
AIC = model1_Red_AIC_train,
PRESS = get_press(model1_Red),
'False positive' = round(model1_Red_error[3]/(model1_Red_error[1]+model1_Red_error[3]),3),
'False negative' = round(model1_Red_error[2]/(model1_Red_error[2]+model1_Red_error[4]),3),
'Error Rate' = round((model1_Red_error[2]+model1_Red_error[3])/(model1_Red_error[1]+model1_Red_error[2]+model1_Red_error[3]+model1_Red_error[4]),3)
)
compare_models<-rbind(evulation_summary[1,],evulation_summary_1R)
compare_models
evulation_summary <- rbind(evulation_summary,evulation_summary_1R)
evulation_summary
NA
# model1_Red
library(ROCR)
preds<-predict(model1_Red,newdata=test, type="response")
rates<-prediction(preds, test$cat_quality)
roc_result<-performance(rates,measure="tpr", x.measure="fpr")
plot(roc_result, main="ROC Curve for model1_Red")
lines(x = c(0,1), y = c(0,1), col="red")
auc<-performance(rates, measure = "auc")
model1_Red_auc <- auc@y.values
regfull_White<-glm(cat_quality~., family="binomial", data=train_White_NoType)
regnull_White<-glm(cat_quality~1,family="binomial", data=train_White_NoType)
step(regnull_White, scope=list(lower=regnull_White, upper=regfull_White), direction="forward")
Start: AIC=4947.36
cat_quality ~ 1
Df Deviance AIC
+ alcohol 1 4280.3 4284.3
+ density 1 4668.2 4672.2
+ volatile.acidity 1 4758.7 4762.7
+ chlorides 1 4805.4 4809.4
+ total.sulfur.dioxide 1 4831.9 4835.9
+ fixed.acidity 1 4912.0 4916.0
+ pH 1 4914.3 4918.3
+ residual.sugar 1 4919.1 4923.1
+ sulphates 1 4934.7 4938.7
<none> 4945.4 4947.4
+ free.sulfur.dioxide 1 4945.2 4949.2
+ citric.acid 1 4945.4 4949.4
Step: AIC=4284.29
cat_quality ~ alcohol
Df Deviance AIC
+ volatile.acidity 1 4030.1 4036.1
+ residual.sugar 1 4214.8 4220.8
+ free.sulfur.dioxide 1 4235.1 4241.1
+ density 1 4253.7 4259.7
+ sulphates 1 4261.3 4267.3
+ fixed.acidity 1 4267.3 4273.3
+ chlorides 1 4273.1 4279.1
+ pH 1 4276.2 4282.2
+ citric.acid 1 4276.3 4282.3
<none> 4280.3 4284.3
+ total.sulfur.dioxide 1 4279.4 4285.4
Step: AIC=4036.1
cat_quality ~ alcohol + volatile.acidity
Df Deviance AIC
+ residual.sugar 1 3942.1 3950.1
+ density 1 3984.6 3992.6
+ free.sulfur.dioxide 1 3999.3 4007.3
+ sulphates 1 4013.2 4021.2
+ fixed.acidity 1 4014.8 4022.8
+ total.sulfur.dioxide 1 4020.2 4028.2
<none> 4030.1 4036.1
+ pH 1 4028.3 4036.3
+ chlorides 1 4028.6 4036.6
+ citric.acid 1 4030.1 4038.1
Step: AIC=3950.11
cat_quality ~ alcohol + volatile.acidity + residual.sugar
Df Deviance AIC
+ fixed.acidity 1 3923.0 3933.0
+ sulphates 1 3924.2 3934.2
+ free.sulfur.dioxide 1 3930.2 3940.2
+ density 1 3932.9 3942.9
+ pH 1 3934.3 3944.3
<none> 3942.1 3950.1
+ total.sulfur.dioxide 1 3941.1 3951.1
+ citric.acid 1 3941.6 3951.6
+ chlorides 1 3942.1 3952.1
Step: AIC=3932.95
cat_quality ~ alcohol + volatile.acidity + residual.sugar + fixed.acidity
Df Deviance AIC
+ sulphates 1 3904.7 3916.7
+ free.sulfur.dioxide 1 3914.0 3926.0
<none> 3923.0 3933.0
+ pH 1 3921.5 3933.5
+ total.sulfur.dioxide 1 3921.7 3933.7
+ density 1 3921.9 3933.9
+ citric.acid 1 3922.7 3934.7
+ chlorides 1 3923.0 3935.0
Step: AIC=3916.74
cat_quality ~ alcohol + volatile.acidity + residual.sugar + fixed.acidity +
sulphates
Df Deviance AIC
+ free.sulfur.dioxide 1 3896.9 3910.9
+ density 1 3898.2 3912.2
<none> 3904.7 3916.7
+ pH 1 3904.6 3918.6
+ total.sulfur.dioxide 1 3904.6 3918.6
+ citric.acid 1 3904.7 3918.7
+ chlorides 1 3904.7 3918.7
Step: AIC=3910.91
cat_quality ~ alcohol + volatile.acidity + residual.sugar + fixed.acidity +
sulphates + free.sulfur.dioxide
Df Deviance AIC
+ density 1 3890.7 3906.7
+ total.sulfur.dioxide 1 3894.7 3910.7
<none> 3896.9 3910.9
+ pH 1 3896.8 3912.8
+ chlorides 1 3896.9 3912.9
+ citric.acid 1 3896.9 3912.9
Step: AIC=3906.7
cat_quality ~ alcohol + volatile.acidity + residual.sugar + fixed.acidity +
sulphates + free.sulfur.dioxide + density
Df Deviance AIC
+ pH 1 3885.2 3903.2
<none> 3890.7 3906.7
+ total.sulfur.dioxide 1 3889.6 3907.6
+ chlorides 1 3890.7 3908.7
+ citric.acid 1 3890.7 3908.7
Step: AIC=3903.24
cat_quality ~ alcohol + volatile.acidity + residual.sugar + fixed.acidity +
sulphates + free.sulfur.dioxide + density + pH
Df Deviance AIC
<none> 3885.2 3903.2
+ total.sulfur.dioxide 1 3884.3 3904.3
+ chlorides 1 3885.0 3905.0
+ citric.acid 1 3885.1 3905.1
Call: glm(formula = cat_quality ~ alcohol + volatile.acidity + residual.sugar +
fixed.acidity + sulphates + free.sulfur.dioxide + density +
pH, family = "binomial", data = train_White_NoType)
Coefficients:
(Intercept) alcohol volatile.acidity residual.sugar
2.041e+02 8.487e-01 -6.416e+00 1.563e-01
fixed.acidity sulphates free.sulfur.dioxide density
-9.407e-03 1.877e+00 6.778e-03 -2.165e+02
pH
8.920e-01
Degrees of Freedom: 3887 Total (i.e. Null); 3879 Residual
Null Deviance: 4945
Residual Deviance: 3885 AIC: 3903
The model looks good after the foward selection, but the predictor fixed.acidity can be removed. The density VIF is above ten, but jsut barely. For now, it will be left in. Time to test and add to the evaluation summary.
model1_White<-glm(formula = cat_quality ~ alcohol + volatile.acidity + residual.sugar +
fixed.acidity + sulphates + free.sulfur.dioxide + density +
pH, family = "binomial", data = train_White_NoType)
summary(model1_White)
Call:
glm(formula = cat_quality ~ alcohol + volatile.acidity + residual.sugar +
fixed.acidity + sulphates + free.sulfur.dioxide + density +
pH, family = "binomial", data = train_White_NoType)
Deviance Residuals:
Min 1Q Median 3Q Max
-3.198 -0.888 0.437 0.798 2.507
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 2.041e+02 6.993e+01 2.918 0.00352 **
alcohol 8.487e-01 9.686e-02 8.763 < 2e-16 ***
volatile.acidity -6.416e+00 4.477e-01 -14.329 < 2e-16 ***
residual.sugar 1.563e-01 2.746e-02 5.689 1.27e-08 ***
fixed.acidity -9.407e-03 7.582e-02 -0.124 0.90126
sulphates 1.877e+00 4.028e-01 4.661 3.15e-06 ***
free.sulfur.dioxide 6.778e-03 2.540e-03 2.669 0.00761 **
density -2.165e+02 7.088e+01 -3.054 0.00226 **
pH 8.920e-01 3.886e-01 2.295 0.02172 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 4945.4 on 3887 degrees of freedom
Residual deviance: 3885.2 on 3879 degrees of freedom
AIC: 3903.2
Number of Fisher Scoring iterations: 5
model1_White<-glm(formula = cat_quality ~ alcohol + volatile.acidity + residual.sugar + sulphates + free.sulfur.dioxide + density +
pH, family = "binomial", data = train_White_NoType)
summary(model1_White)
Call:
glm(formula = cat_quality ~ alcohol + volatile.acidity + residual.sugar +
sulphates + free.sulfur.dioxide + density + pH, family = "binomial",
data = train_White_NoType)
Deviance Residuals:
Min 1Q Median 3Q Max
-3.1956 -0.8867 0.4377 0.7961 2.5070
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 2.105e+02 4.762e+01 4.420 9.87e-06 ***
alcohol 8.408e-01 7.311e-02 11.501 < 2e-16 ***
volatile.acidity -6.407e+00 4.417e-01 -14.504 < 2e-16 ***
residual.sugar 1.586e-01 1.972e-02 8.045 8.66e-16 ***
sulphates 1.885e+00 3.978e-01 4.739 2.14e-06 ***
free.sulfur.dioxide 6.796e-03 2.536e-03 2.680 0.00737 **
density -2.230e+02 4.773e+01 -4.673 2.97e-06 ***
pH 9.247e-01 2.859e-01 3.235 0.00122 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 4945.4 on 3887 degrees of freedom
Residual deviance: 3885.3 on 3880 degrees of freedom
AIC: 3901.3
Number of Fisher Scoring iterations: 5
##evaluating model
model1_White_AIC_train <- model1_White$aic
##predicted quality for test data based on training data
test_White_NoType<-subset(test, Type == 1, select=-c(Type))
preds<-predict(model1_White,newdata=test_White_NoType, type="response")
model1_White_error <- table(test_White_NoType$cat_quality, preds>0.7)
#Curves
evulation_summary_1W <- data.frame(
attempt = 'model1_White',
AIC = model1_White_AIC_train,
PRESS = get_press(model1_White),
'False positive' = round(model1_White_error[3]/(model1_White_error[1]+model1_White_error[3]),3),
'False negative' = round(model1_White_error[2]/(model1_White_error[2]+model1_White_error[4]),3),
'Error Rate' = round((model1_White_error[2]+model1_White_error[3])/(model1_White_error[1]+model1_White_error[2]+model1_White_error[3]+model1_White_error[4]),3)
)
evulation_summary <- rbind(evulation_summary,evulation_summary_1W)
evulation_summary
compare_models<-rbind(compare_models,evulation_summary_1W)
compare_models
NA
regfull_int<-glm(cat_quality~.*Type, family="binomial", data=train)
regnull_int<-glm(cat_quality~1,family="binomial", data=train)
step(regnull_int, scope=list(lower=regnull_int, upper=regfull_int), direction="forward")
Start: AIC=6835.15
cat_quality ~ 1
Df Deviance AIC
+ alcohol 1 5904.3 5908.3
+ density 1 6450.6 6454.6
+ volatile.acidity 1 6468.6 6472.6
+ chlorides 1 6640.0 6644.0
+ Type 1 6755.3 6759.3
+ citric.acid 1 6801.3 6805.3
+ fixed.acidity 1 6807.6 6811.6
+ free.sulfur.dioxide 1 6822.7 6826.7
+ total.sulfur.dioxide 1 6823.5 6827.5
+ sulphates 1 6827.8 6831.8
+ residual.sugar 1 6830.8 6834.8
<none> 6833.2 6835.2
+ pH 1 6831.4 6835.4
Step: AIC=5908.3
cat_quality ~ alcohol
Df Deviance AIC
+ volatile.acidity 1 5539.7 5545.7
+ residual.sugar 1 5781.4 5787.4
+ free.sulfur.dioxide 1 5810.8 5816.8
+ Type 1 5826.6 5832.6
+ chlorides 1 5861.3 5867.3
+ citric.acid 1 5864.8 5870.8
+ total.sulfur.dioxide 1 5872.3 5878.3
+ fixed.acidity 1 5893.0 5899.0
+ pH 1 5895.5 5901.5
+ sulphates 1 5897.6 5903.6
<none> 5904.3 5908.3
+ density 1 5903.5 5909.5
Step: AIC=5545.67
cat_quality ~ alcohol + volatile.acidity
Df Deviance AIC
+ density 1 5476.5 5484.5
+ sulphates 1 5477.1 5485.1
+ residual.sugar 1 5500.5 5508.5
+ Type 1 5504.4 5512.4
+ total.sulfur.dioxide 1 5524.4 5532.4
+ pH 1 5532.9 5540.9
+ free.sulfur.dioxide 1 5533.7 5541.7
<none> 5539.7 5545.7
+ chlorides 1 5537.8 5545.8
+ citric.acid 1 5537.9 5545.9
+ fixed.acidity 1 5538.6 5546.6
Step: AIC=5484.51
cat_quality ~ alcohol + volatile.acidity + density
Df Deviance AIC
+ sulphates 1 5442.1 5452.1
+ citric.acid 1 5463.1 5473.1
+ fixed.acidity 1 5463.8 5473.8
+ total.sulfur.dioxide 1 5464.0 5474.0
+ Type 1 5465.2 5475.2
+ pH 1 5471.0 5481.0
+ free.sulfur.dioxide 1 5471.2 5481.2
+ residual.sugar 1 5472.8 5482.8
<none> 5476.5 5484.5
+ chlorides 1 5476.3 5486.3
Step: AIC=5452.14
cat_quality ~ alcohol + volatile.acidity + density + sulphates
Df Deviance AIC
+ residual.sugar 1 5419.3 5431.3
+ fixed.acidity 1 5422.1 5434.1
+ citric.acid 1 5424.1 5436.1
+ free.sulfur.dioxide 1 5432.7 5444.7
+ total.sulfur.dioxide 1 5435.6 5447.6
+ pH 1 5439.3 5451.3
+ chlorides 1 5439.4 5451.4
<none> 5442.1 5452.1
+ Type 1 5440.5 5452.5
Step: AIC=5431.3
cat_quality ~ alcohol + volatile.acidity + density + sulphates +
residual.sugar
Df Deviance AIC
+ Type 1 5379.2 5393.2
+ total.sulfur.dioxide 1 5385.9 5399.9
+ citric.acid 1 5404.8 5418.8
+ pH 1 5410.7 5424.7
+ fixed.acidity 1 5415.4 5429.4
<none> 5419.3 5431.3
+ free.sulfur.dioxide 1 5417.5 5431.5
+ chlorides 1 5418.8 5432.8
Step: AIC=5393.24
cat_quality ~ alcohol + volatile.acidity + density + sulphates +
residual.sugar + Type
Df Deviance AIC
+ volatile.acidity:Type 1 5348.7 5364.7
+ total.sulfur.dioxide 1 5371.4 5387.4
+ pH 1 5371.9 5387.9
+ citric.acid 1 5372.7 5388.7
+ free.sulfur.dioxide 1 5373.9 5389.9
+ residual.sugar:Type 1 5374.6 5390.6
+ chlorides 1 5376.4 5392.4
+ density:Type 1 5377.1 5393.1
<none> 5379.2 5393.2
+ fixed.acidity 1 5378.4 5394.4
+ alcohol:Type 1 5379.2 5395.2
+ sulphates:Type 1 5379.2 5395.2
Step: AIC=5364.67
cat_quality ~ alcohol + volatile.acidity + density + sulphates +
residual.sugar + Type + volatile.acidity:Type
Df Deviance AIC
+ total.sulfur.dioxide 1 5342.0 5360.0
+ residual.sugar:Type 1 5343.3 5361.3
+ free.sulfur.dioxide 1 5344.5 5362.5
+ pH 1 5344.7 5362.7
+ density:Type 1 5345.2 5363.2
+ chlorides 1 5345.8 5363.8
+ citric.acid 1 5346.2 5364.2
<none> 5348.7 5364.7
+ alcohol:Type 1 5348.3 5366.3
+ sulphates:Type 1 5348.5 5366.5
+ fixed.acidity 1 5348.6 5366.6
Step: AIC=5359.98
cat_quality ~ alcohol + volatile.acidity + density + sulphates +
residual.sugar + Type + total.sulfur.dioxide + volatile.acidity:Type
Df Deviance AIC
+ total.sulfur.dioxide:Type 1 5303.1 5323.1
+ free.sulfur.dioxide 1 5323.1 5343.1
+ residual.sugar:Type 1 5337.0 5357.0
+ pH 1 5337.3 5357.3
+ chlorides 1 5338.9 5358.9
+ density:Type 1 5339.4 5359.4
+ citric.acid 1 5339.9 5359.9
<none> 5342.0 5360.0
+ alcohol:Type 1 5341.7 5361.7
+ fixed.acidity 1 5341.8 5361.8
+ sulphates:Type 1 5341.9 5361.9
Step: AIC=5323.07
cat_quality ~ alcohol + volatile.acidity + density + sulphates +
residual.sugar + Type + total.sulfur.dioxide + volatile.acidity:Type +
Type:total.sulfur.dioxide
Df Deviance AIC
+ free.sulfur.dioxide 1 5286.9 5308.9
+ density:Type 1 5295.6 5317.6
+ pH 1 5299.6 5321.6
+ alcohol:Type 1 5300.2 5322.2
+ chlorides 1 5300.3 5322.3
<none> 5303.1 5323.1
+ citric.acid 1 5302.1 5324.1
+ sulphates:Type 1 5302.2 5324.2
+ residual.sugar:Type 1 5302.4 5324.4
+ fixed.acidity 1 5302.9 5324.9
Step: AIC=5308.91
cat_quality ~ alcohol + volatile.acidity + density + sulphates +
residual.sugar + Type + total.sulfur.dioxide + free.sulfur.dioxide +
volatile.acidity:Type + Type:total.sulfur.dioxide
Df Deviance AIC
+ density:Type 1 5281.0 5305.0
+ chlorides 1 5283.6 5307.6
+ alcohol:Type 1 5283.8 5307.8
+ pH 1 5284.1 5308.1
<none> 5286.9 5308.9
+ free.sulfur.dioxide:Type 1 5285.6 5309.6
+ citric.acid 1 5285.8 5309.8
+ residual.sugar:Type 1 5286.2 5310.2
+ sulphates:Type 1 5286.3 5310.3
+ fixed.acidity 1 5286.8 5310.8
Step: AIC=5305.04
cat_quality ~ alcohol + volatile.acidity + density + sulphates +
residual.sugar + Type + total.sulfur.dioxide + free.sulfur.dioxide +
volatile.acidity:Type + Type:total.sulfur.dioxide + density:Type
Df Deviance AIC
+ pH 1 5275.6 5301.6
+ residual.sugar:Type 1 5276.7 5302.7
+ chlorides 1 5277.1 5303.1
<none> 5281.0 5305.0
+ citric.acid 1 5279.1 5305.1
+ free.sulfur.dioxide:Type 1 5279.2 5305.2
+ fixed.acidity 1 5280.0 5306.0
+ alcohol:Type 1 5280.6 5306.6
+ sulphates:Type 1 5280.9 5306.9
Step: AIC=5301.61
cat_quality ~ alcohol + volatile.acidity + density + sulphates +
residual.sugar + Type + total.sulfur.dioxide + free.sulfur.dioxide +
pH + volatile.acidity:Type + Type:total.sulfur.dioxide +
density:Type
Df Deviance AIC
+ residual.sugar:Type 1 5270.3 5298.3
+ pH:Type 1 5271.9 5299.9
+ chlorides 1 5272.9 5300.9
<none> 5275.6 5301.6
+ free.sulfur.dioxide:Type 1 5274.3 5302.3
+ fixed.acidity 1 5275.0 5303.0
+ citric.acid 1 5275.1 5303.1
+ sulphates:Type 1 5275.2 5303.2
+ alcohol:Type 1 5275.4 5303.4
Step: AIC=5298.31
cat_quality ~ alcohol + volatile.acidity + density + sulphates +
residual.sugar + Type + total.sulfur.dioxide + free.sulfur.dioxide +
pH + volatile.acidity:Type + Type:total.sulfur.dioxide +
density:Type + residual.sugar:Type
Df Deviance AIC
+ pH:Type 1 5266.7 5296.7
+ chlorides 1 5268.0 5298.0
<none> 5270.3 5298.3
+ free.sulfur.dioxide:Type 1 5268.4 5298.4
+ fixed.acidity 1 5269.7 5299.7
+ citric.acid 1 5269.9 5299.9
+ sulphates:Type 1 5270.1 5300.1
+ alcohol:Type 1 5270.2 5300.2
Step: AIC=5296.67
cat_quality ~ alcohol + volatile.acidity + density + sulphates +
residual.sugar + Type + total.sulfur.dioxide + free.sulfur.dioxide +
pH + volatile.acidity:Type + Type:total.sulfur.dioxide +
density:Type + residual.sugar:Type + Type:pH
Df Deviance AIC
+ chlorides 1 5263.4 5295.4
+ free.sulfur.dioxide:Type 1 5263.8 5295.8
<none> 5266.7 5296.7
+ citric.acid 1 5265.9 5297.9
+ alcohol:Type 1 5266.4 5298.4
+ sulphates:Type 1 5266.6 5298.6
+ fixed.acidity 1 5266.6 5298.6
Step: AIC=5295.44
cat_quality ~ alcohol + volatile.acidity + density + sulphates +
residual.sugar + Type + total.sulfur.dioxide + free.sulfur.dioxide +
pH + chlorides + volatile.acidity:Type + Type:total.sulfur.dioxide +
density:Type + residual.sugar:Type + Type:pH
Df Deviance AIC
+ chlorides:Type 1 5259.3 5293.3
+ free.sulfur.dioxide:Type 1 5260.5 5294.5
<none> 5263.4 5295.4
+ sulphates:Type 1 5262.7 5296.7
+ citric.acid 1 5263.0 5297.0
+ alcohol:Type 1 5263.2 5297.2
+ fixed.acidity 1 5263.4 5297.4
Step: AIC=5293.31
cat_quality ~ alcohol + volatile.acidity + density + sulphates +
residual.sugar + Type + total.sulfur.dioxide + free.sulfur.dioxide +
pH + chlorides + volatile.acidity:Type + Type:total.sulfur.dioxide +
density:Type + residual.sugar:Type + Type:pH + Type:chlorides
Df Deviance AIC
+ free.sulfur.dioxide:Type 1 5256.2 5292.2
<none> 5259.3 5293.3
+ sulphates:Type 1 5257.9 5293.9
+ citric.acid 1 5258.9 5294.9
+ alcohol:Type 1 5259.3 5295.3
+ fixed.acidity 1 5259.3 5295.3
Step: AIC=5292.16
cat_quality ~ alcohol + volatile.acidity + density + sulphates +
residual.sugar + Type + total.sulfur.dioxide + free.sulfur.dioxide +
pH + chlorides + volatile.acidity:Type + Type:total.sulfur.dioxide +
density:Type + residual.sugar:Type + Type:pH + Type:chlorides +
Type:free.sulfur.dioxide
Df Deviance AIC
<none> 5256.2 5292.2
+ sulphates:Type 1 5254.7 5292.7
+ citric.acid 1 5255.9 5293.9
+ alcohol:Type 1 5256.1 5294.1
+ fixed.acidity 1 5256.2 5294.2
Call: glm(formula = cat_quality ~ alcohol + volatile.acidity + density +
sulphates + residual.sugar + Type + total.sulfur.dioxide +
free.sulfur.dioxide + pH + chlorides + volatile.acidity:Type +
Type:total.sulfur.dioxide + density:Type + residual.sugar:Type +
Type:pH + Type:chlorides + Type:free.sulfur.dioxide, family = "binomial",
data = train)
Coefficients:
(Intercept) alcohol volatile.acidity
9.68713 0.84990 -2.84978
density sulphates residual.sugar
-15.58322 2.24908 0.03117
Type total.sulfur.dioxide free.sulfur.dioxide
195.20143 -0.01898 0.02478
pH chlorides volatile.acidity:Type
-0.60200 -4.02470 -3.50882
Type:total.sulfur.dioxide density:Type residual.sugar:Type
0.01756 -201.96697 0.12808
Type:pH Type:chlorides Type:free.sulfur.dioxide
1.50862 5.10775 -0.01620
Degrees of Freedom: 5196 Total (i.e. Null); 5179 Residual
Null Deviance: 6833
Residual Deviance: 5256 AIC: 5292
The forward step process dropped + sulphates:Type, fixed.acidity, alcohol:Type, and citric.acid By the hierarchical principle, the two non-interactive terms need to be added back because their have interaction terms are in the model.
model1_int<-glm(formula = cat_quality ~ alcohol + volatile.acidity + density +
sulphates + residual.sugar + Type + total.sulfur.dioxide +
free.sulfur.dioxide + pH + chlorides + volatile.acidity:Type +
Type:total.sulfur.dioxide + density:Type + residual.sugar:Type +
Type:pH + Type:chlorides + Type:free.sulfur.dioxide, family = "binomial",
data = train)
summary(model1_int)
Call:
glm(formula = cat_quality ~ alcohol + volatile.acidity + density +
sulphates + residual.sugar + Type + total.sulfur.dioxide +
free.sulfur.dioxide + pH + chlorides + volatile.acidity:Type +
Type:total.sulfur.dioxide + density:Type + residual.sugar:Type +
Type:pH + Type:chlorides + Type:free.sulfur.dioxide, family = "binomial",
data = train)
Deviance Residuals:
Min 1Q Median 3Q Max
-3.2378 -0.8774 0.4148 0.8069 2.4950
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 9.687e+00 4.539e+01 0.213 0.830982
alcohol 8.499e-01 5.665e-02 15.002 < 2e-16 ***
volatile.acidity -2.850e+00 4.243e-01 -6.716 1.87e-11 ***
density -1.558e+01 4.495e+01 -0.347 0.728824
sulphates 2.249e+00 3.032e-01 7.418 1.19e-13 ***
residual.sugar 3.117e-02 5.020e-02 0.621 0.534702
Type 1.952e+02 5.575e+01 3.501 0.000463 ***
total.sulfur.dioxide -1.897e-02 2.917e-03 -6.505 7.77e-11 ***
free.sulfur.dioxide 2.478e-02 8.602e-03 2.880 0.003971 **
pH -6.020e-01 4.860e-01 -1.239 0.215502
chlorides -4.025e+00 1.527e+00 -2.636 0.008391 **
volatile.acidity:Type -3.509e+00 6.200e-01 -5.659 1.52e-08 ***
Type:total.sulfur.dioxide 1.756e-02 3.200e-03 5.487 4.09e-08 ***
density:Type -2.020e+02 5.584e+01 -3.617 0.000299 ***
residual.sugar:Type 1.281e-01 5.214e-02 2.456 0.014036 *
Type:pH 1.509e+00 5.610e-01 2.689 0.007159 **
Type:chlorides 5.108e+00 2.438e+00 2.095 0.036133 *
Type:free.sulfur.dioxide -1.620e-02 9.147e-03 -1.771 0.076642 .
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 6833.2 on 5196 degrees of freedom
Residual deviance: 5256.2 on 5179 degrees of freedom
AIC: 5292.2
Number of Fisher Scoring iterations: 5
##evaluating model
model1_int_AIC_train <- model1_int$aic
##predicted quality for test data based on training data
preds<-predict(model1_int,newdata=test, type="response")
model1_int_error <- table(test$cat_quality, preds>0.7)
#Curves
evulation_summary_1int <- data.frame(
attempt = 'model1_int',
AIC = model1_int_AIC_train,
PRESS = get_press(model1_int),
'False positive' = round(model1_int_error[3]/(model1_int_error[1]+model1_int_error[3]),3),
'False negative' = round(model1_int_error[2]/(model1_int_error[2]+model1_int_error[4]),3),
'Error Rate' = round((model1_int_error[2]+model1_int_error[3])/(model1_int_error[1]+model1_int_error[2]+model1_int_error[3]+model1_int_error[4]),3)
)
evulation_summary <- rbind(evulation_summary,evulation_summary_1int)
evulation_summary
compare_models<-rbind(compare_models,evulation_summary_1int)
compare_models
NA
compare_models<-compare_models%>%
rename(
Model = attempt
)
library(data.table)
Registered S3 method overwritten by 'data.table':
method from
print.data.table
data.table 1.14.0 using 1 threads (see ?getDTthreads). Latest news: r-datatable.com
**********
This installation of data.table has not detected OpenMP support. It should still work but in single-threaded mode.
This is a Mac. Please read https://mac.r-project.org/openmp/. Please engage with Apple and ask them for support. Check r-datatable.com for updates, and our Mac instructions here: https://github.com/Rdatatable/data.table/wiki/Installation. After several years of many reports of installation problems on Mac, it's time to gingerly point out that there have been no similar problems on Windows or Linux.
**********
Attaching package: ‘data.table’
The following objects are masked from ‘package:reshape2’:
dcast, melt
The following objects are masked from ‘package:dplyr’:
between, first, last
The following object is masked from ‘package:purrr’:
transpose
library(dplyr)
library(formattable)
Registered S3 methods overwritten by 'htmltools':
method from
print.html tools:rstudio
print.shiny.tag tools:rstudio
print.shiny.tag.list tools:rstudio
Registered S3 method overwritten by 'htmlwidgets':
method from
print.htmlwidget tools:rstudio
library(tidyr)
customGreen0 = "#DeF7E9"
customGreen = "#71CA97"
customRed = "#ff7f7f"
Creating the ROC curves and AUC for the 3 new models.
# model1_Red
preds<-predict(model1_Red,newdata=test, type="response")
rates<-prediction(preds, test$cat_quality)
roc_result<-performance(rates,measure="tpr", x.measure="fpr")
plot(roc_result, main="ROC Curve for model1_Red")
lines(x = c(0,1), y = c(0,1), col="red")
auc<-performance(rates, measure = "auc")
model1_Red_auc <- auc@y.values
# model1_White
preds<-predict(model1_White,newdata=test, type="response")
rates<-prediction(preds, test$cat_quality)
roc_result<-performance(rates,measure="tpr", x.measure="fpr")
plot(roc_result, main="ROC Curve for model1_White")
lines(x = c(0,1), y = c(0,1), col="red")
auc<-performance(rates, measure = "auc")
model1_White_auc <- auc@y.values
# model1_int
preds<-predict(model1_int,newdata=test, type="response")
rates<-prediction(preds, test$cat_quality)
roc_result<-performance(rates,measure="tpr", x.measure="fpr")
plot(roc_result, main="ROC Curve for model1_int")
lines(x = c(0,1), y = c(0,1), col="red")
auc<-performance(rates, measure = "auc")
model1_int_auc <- auc@y.values
reduced_1_auc
[[1]]
[1] 0.7949768
model1_Red_auc
[[1]]
[1] 0.7541583
model1_White_auc
[[1]]
[1] 0.7752713
model1_int_auc
[[1]]
[1] 0.8015216
##create heat map Consolidated
ggplot(data = melted_cor_train, aes(x=Var1, y=Var2, fill=value)) +
geom_tile(color = "white")+
scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0, limit = c(-1,1), space = "Lab", name="Pearson\nCorrelation") +
theme_minimal()+
theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 12, hjust = 1))+
coord_fixed()+
labs(title = 'Consolidated (Both Red and White)')
##create heat map White
ggplot(data = melted_cor_train_white, aes(x=Var1, y=Var2, fill=value)) +
geom_tile(color = "white")+
scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0, limit = c(-1,1), space = "Lab", name="Pearson\nCorrelation") +
theme_minimal()+
theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 12, hjust = 1))+
coord_fixed()+
labs(title = 'White Wine')
##create heat map Red
ggplot(data = melted_cor_train_Red, aes(x=Var1, y=Var2, fill=value)) +
geom_tile(color = "white")+
scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0, limit = c(-1,1), space = "Lab", name="Pearson\nCorrelation") +
theme_minimal()+
theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 12, hjust = 1))+
coord_fixed()+
labs(title = 'Red Wine')
ggplot(train_with_qual, mapping = aes(x=quality, fill=Type))+
geom_histogram(binwidth=1, alpha=.4, position="identity", color="black")+
geom_vline(aes(xintercept=5.5, color="red"),
linetype="dashed")+
scale_color_manual(name = "Cut Off", values = c("red"))+
labs(x="Quality",
y="Frequency",
title="Distribution of Quality Rating by Wine Type")
This is the table for showing the evaluation for the first model
formattable(evulation_summary[1,])
summary(model1_Red)
Call:
glm(formula = cat_quality ~ alcohol + volatile.acidity + total.sulfur.dioxide +
sulphates + free.sulfur.dioxide + chlorides, family = "binomial",
data = train_Red_NoType)
Deviance Residuals:
Min 1Q Median 3Q Max
-3.0655 -0.8628 0.3239 0.8474 2.3082
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -8.021553 0.886912 -9.044 < 2e-16 ***
alcohol 0.855123 0.078409 10.906 < 2e-16 ***
volatile.acidity -2.931148 0.413395 -7.090 1.34e-12 ***
total.sulfur.dioxide -0.018428 0.002926 -6.297 3.03e-10 ***
sulphates 2.637391 0.456973 5.771 7.86e-09 ***
free.sulfur.dioxide 0.023257 0.008608 2.702 0.00689 **
chlorides -4.101411 1.588464 -2.582 0.00982 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 1809.9 on 1308 degrees of freedom
Residual deviance: 1372.6 on 1302 degrees of freedom
AIC: 1386.6
Number of Fisher Scoring iterations: 4
summary(model1_White)
Call:
glm(formula = cat_quality ~ alcohol + volatile.acidity + residual.sugar +
sulphates + free.sulfur.dioxide + density + pH, family = "binomial",
data = train_White_NoType)
Deviance Residuals:
Min 1Q Median 3Q Max
-3.1956 -0.8867 0.4377 0.7961 2.5070
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 2.105e+02 4.762e+01 4.420 9.87e-06 ***
alcohol 8.408e-01 7.311e-02 11.501 < 2e-16 ***
volatile.acidity -6.407e+00 4.417e-01 -14.504 < 2e-16 ***
residual.sugar 1.586e-01 1.972e-02 8.045 8.66e-16 ***
sulphates 1.885e+00 3.978e-01 4.739 2.14e-06 ***
free.sulfur.dioxide 6.796e-03 2.536e-03 2.680 0.00737 **
density -2.230e+02 4.773e+01 -4.673 2.97e-06 ***
pH 9.247e-01 2.859e-01 3.235 0.00122 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 4945.4 on 3887 degrees of freedom
Residual deviance: 3885.3 on 3880 degrees of freedom
AIC: 3901.3
Number of Fisher Scoring iterations: 5
summary(model1_int)
Call:
glm(formula = cat_quality ~ alcohol + volatile.acidity + density +
sulphates + residual.sugar + Type + total.sulfur.dioxide +
free.sulfur.dioxide + pH + chlorides + volatile.acidity:Type +
Type:total.sulfur.dioxide + density:Type + residual.sugar:Type +
Type:pH + Type:chlorides + Type:free.sulfur.dioxide, family = "binomial",
data = train)
Deviance Residuals:
Min 1Q Median 3Q Max
-3.2378 -0.8774 0.4148 0.8069 2.4950
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 9.687e+00 4.539e+01 0.213 0.830982
alcohol 8.499e-01 5.665e-02 15.002 < 2e-16 ***
volatile.acidity -2.850e+00 4.243e-01 -6.716 1.87e-11 ***
density -1.558e+01 4.495e+01 -0.347 0.728824
sulphates 2.249e+00 3.032e-01 7.418 1.19e-13 ***
residual.sugar 3.117e-02 5.020e-02 0.621 0.534702
Type 1.952e+02 5.575e+01 3.501 0.000463 ***
total.sulfur.dioxide -1.897e-02 2.917e-03 -6.505 7.77e-11 ***
free.sulfur.dioxide 2.478e-02 8.602e-03 2.880 0.003971 **
pH -6.020e-01 4.860e-01 -1.239 0.215502
chlorides -4.025e+00 1.527e+00 -2.636 0.008391 **
volatile.acidity:Type -3.509e+00 6.200e-01 -5.659 1.52e-08 ***
Type:total.sulfur.dioxide 1.756e-02 3.200e-03 5.487 4.09e-08 ***
density:Type -2.020e+02 5.584e+01 -3.617 0.000299 ***
residual.sugar:Type 1.281e-01 5.214e-02 2.456 0.014036 *
Type:pH 1.509e+00 5.610e-01 2.689 0.007159 **
Type:chlorides 5.108e+00 2.438e+00 2.095 0.036133 *
Type:free.sulfur.dioxide -1.620e-02 9.147e-03 -1.771 0.076642 .
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 6833.2 on 5196 degrees of freedom
Residual deviance: 5256.2 on 5179 degrees of freedom
AIC: 5292.2
Number of Fisher Scoring iterations: 5
Table right above the “Best Possible Model (Reduced_4)” section.
formattable(compare_models,align =c("l","c", "c", "c", "c", "r"))
NA
This is the table for showing the best models (top five)
formattable(res.best.logistic$BestModels)
Add ROC for reduced_1, model1_Red, model1_White, model1_int
preds<-predict(reduced_1,newdata=test, type="response")
rates<-prediction(preds, test$cat_quality)
roc_result<-performance(rates,measure="tpr", x.measure="fpr")
plot(roc_result, main="ROC Curve for reduced_1")
lines(x = c(0,1), y = c(0,1), col="red")
auc<-performance(rates, measure = "auc")
reduced_1_auc <- auc@y.values
# model1_Red
preds<-predict(model1_Red,newdata=test, type="response")
rates<-prediction(preds, test$cat_quality)
roc_result<-performance(rates,measure="tpr", x.measure="fpr")
plot(roc_result, main="ROC Curve for model1_Red")
lines(x = c(0,1), y = c(0,1), col="red")
auc<-performance(rates, measure = "auc")
model1_Red_auc <- auc@y.values
# model1_White
preds<-predict(model1_White,newdata=test, type="response")
rates<-prediction(preds, test$cat_quality)
roc_result<-performance(rates,measure="tpr", x.measure="fpr")
plot(roc_result, main="ROC Curve for model1_White")
lines(x = c(0,1), y = c(0,1), col="red")
auc<-performance(rates, measure = "auc")
model1_White_auc <- auc@y.values
# model1_int
preds<-predict(model1_int,newdata=test, type="response")
rates<-prediction(preds, test$cat_quality)
roc_result<-performance(rates,measure="tpr", x.measure="fpr")
plot(roc_result, main="ROC Curve for model1_int")
lines(x = c(0,1), y = c(0,1), col="red")
auc<-performance(rates, measure = "auc")
model1_int_auc <- auc@y.values
This is the table for showing the best models (top five)
formattable(evulation_summary[2,])
This is the table for reduced_4 VIF.
formattable(data.frame(reg_4_VIF_test), align =c("l","r"))
This is the next VIF plot in the report
formattable(data.frame(reg_4_2_VIF_test), align =c("l","r"))
The table below that. It is the evaluation summary for reduced_4_2
formattable(evulation_summary[3,])
evaluation summary for the outlier/leverage/etc.
formattable(evulation_summary[4:7,])
add roc curves for these four.
## reduced_4
preds<-predict(reduced_4,newdata=test, type="response")
rates4<-prediction(preds, test$cat_quality)
roc_result<-performance(rates4,measure="tpr", x.measure="fpr")
plot(roc_result, main="ROC Curve for reduced_4")
lines(x = c(0,1), y = c(0,1), col="red")
auc4<-performance(rates4, measure = "auc")
reduced_4_auc <- auc4@y.values
## reduced_4_2
preds<-predict(reduced_4_2,newdata=test, type="response")
rates<-prediction(preds, test$cat_quality)
roc_result<-performance(rates,measure="tpr", x.measure="fpr")
plot(roc_result, main="ROC Curve for reduced_4_2")
lines(x = c(0,1), y = c(0,1), col="red")
auc4_2<-performance(rates, measure = "auc")
reduced_4_2_auc <- auc4_2@y.values
## reduced_4_3
preds<-predict(reduced_4_3,newdata=test, type="response")
rates<-prediction(preds, test$cat_quality)
roc_result<-performance(rates,measure="tpr", x.measure="fpr")
plot(roc_result, main="ROC Curve for reduced_4_3")
lines(x = c(0,1), y = c(0,1), col="red")
auc<-performance(rates, measure = "auc")
reduced_4_3_auc <- auc@y.values
## reduced_4_4_lev
preds<-predict(reduced_4_4_lev,newdata=test, type="response")
rates<-prediction(preds, test$cat_quality)
roc_result<-performance(rates,measure="tpr", x.measure="fpr")
plot(roc_result, main="ROC Curve for reduced_4_4_lev")
lines(x = c(0,1), y = c(0,1), col="red")
auc<-performance(rates, measure = "auc")
reduced_4_4_lev_auc <- auc@y.values
## reduced_4_5_DDFFITS
preds<-predict(reduced_4_5_DDFFITS,newdata=test, type="response")
rates<-prediction(preds, test$cat_quality)
roc_result<-performance(rates,measure="tpr", x.measure="fpr")
plot(roc_result, main="ROC Curve for reduced_4_5_DDFFITS")
lines(x = c(0,1), y = c(0,1), col="red")
auc<-performance(rates, measure = "auc")
reduced_4_5_DDFFITS_auc <- auc@y.values
## reduced_4_6_no_special
preds<-predict(reduced_4_6_no_special,newdata=test, type="response")
rates<-prediction(preds, test$cat_quality)
roc_result<-performance(rates,measure="tpr", x.measure="fpr")
plot(roc_result, main="ROC Curve for reduced_4_6_no_special")
lines(x = c(0,1), y = c(0,1), col="red")
auc<-performance(rates, measure = "auc")
reduced_4_6_no_special_auc <- auc@y.values
AUC_summary <- data.frame('reduced_1'=reduced_1_auc,
'reduced_4'=reduced_4_auc,
'reduced_4_2'=reduced_4_2_auc,
'reduced_4_3'=reduced_4_3_auc,
'reduced_4_4_lev'=reduced_4_4_lev_auc,
'reduced_4_5_DDFFITS'=reduced_4_5_DDFFITS_auc,
'reduced_4_6_no_special'=reduced_4_6_no_special_auc)
colnames(AUC_summary) <- c('reduced_1','reduced_4','reduced_4_2','reduced_4_3','reduced_4_4_lev','reduced_4_5_DDFFITS','reduced_4_6_no_special')
AUC_summary
formattable(evulation_summary[1:7,])
The first research question focused on exploring the relationships between different chemical features in the dataset and wine type (categorized as either red or white). Since the response variable, wine type, was a Bernoulli random variable and the observations were independent, a logistic regression was used for the predictive model.
As seen above in the previous section, exploratory data analysis was performed in order to better understand the relationships between individual predictor variables and wine type. Density plots visualized the quantitative predictors, while frequency bar charts visualized the categorical wine quality predictor. The majority of these features were distributed differently across wine type, indicating that they could be used to differentiate between red and white wine in the model. However, as seen in Figure XXX, alcohol, pH, and sulphates do seem to have a weaker association with wine type since their red and white density plots look somewhat similar.
After exploring these features, an initial model was generated with all of the predictors using the random training sample from the combined wine dataset. From the model summary, the chemical properties of fixed acidity, citric acid, pH, and sulphates all appeared to have insignificant z-scores. To determine if all of these features could be dropped from the model, a likelihood ratio test comparing the full and reduced model was conducted. This test resulted in an insignificant p-value of 0.1360. Therefore, there was not a significant difference between the two models and the reduced model could be adopted. This model’s performance is summarized below. XXX
Call:
glm(formula = Type ~ ., family = "binomial", data = train)
Deviance Residuals:
Min 1Q Median 3Q Max
-5.7661 -0.0003 0.0133 0.0465 4.9615
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 1.525e+03 1.963e+02 7.768 7.98e-15 ***
fixed.acidity 1.287e-01 2.621e-01 0.491 0.62344
volatile.acidity -7.722e+00 1.226e+00 -6.300 2.98e-10 ***
citric.acid 1.608e+00 1.295e+00 1.242 0.21428
residual.sugar 8.972e-01 1.113e-01 8.061 7.58e-16 ***
chlorides -2.674e+01 5.116e+00 -5.227 1.73e-07 ***
free.sulfur.dioxide -4.604e-02 1.679e-02 -2.742 0.00611 **
total.sulfur.dioxide 5.215e-02 5.436e-03 9.594 < 2e-16 ***
density -1.518e+03 2.007e+02 -7.564 3.89e-14 ***
pH 3.214e-03 1.620e+00 0.002 0.99842
sulphates -4.389e+00 1.383e+00 -3.174 0.00150 **
alcohol -1.188e+00 2.915e-01 -4.077 4.57e-05 ***
quality -4.123e-01 2.294e-01 -1.797 0.07235 .
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 5866.24 on 5196 degrees of freedom
Residual deviance: 337.27 on 5184 degrees of freedom
AIC: 363.27
Number of Fisher Scoring iterations: 10
[[1]]
[1] 0.9946774
FALSE TRUE
Red 285 5
White 3 1007
Call:
glm(formula = Type ~ . - quality - pH - fixed.acidity - citric.acid,
family = "binomial", data = train)
Deviance Residuals:
Min 1Q Median 3Q Max
-5.8141 -0.0002 0.0125 0.0439 4.4116
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 1.430e+03 1.226e+02 11.664 < 2e-16 ***
volatile.acidity -8.193e+00 1.007e+00 -8.139 4.00e-16 ***
residual.sugar 8.752e-01 9.663e-02 9.058 < 2e-16 ***
chlorides -2.542e+01 4.340e+00 -5.857 4.71e-09 ***
free.sulfur.dioxide -5.730e-02 1.525e-02 -3.757 0.000172 ***
total.sulfur.dioxide 5.588e-02 5.210e-03 10.725 < 2e-16 ***
density -1.423e+03 1.219e+02 -11.672 < 2e-16 ***
sulphates -5.093e+00 1.360e+00 -3.744 0.000181 ***
alcohol -1.225e+00 2.323e-01 -5.274 1.34e-07 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 5866.24 on 5196 degrees of freedom
Residual deviance: 344.26 on 5188 degrees of freedom
AIC: 362.26
Number of Fisher Scoring iterations: 9
The ROC curve XXX, seen below, indicates that the model performed well (significantly better than random guessing shown by the red line). Furthermore, the AUC, which represents the area under this curve, was 0.994. These performance metrics suggest that the model is quite effective at predicting the wine type of the training data. When this model predicted the wine type of the testing data (the sample leftover from the training set), it maintained this high performance. We used a probability floor of 0.65 since this value minimized overall error (we saw no practical difference between type I and type II errors in this context). Using this value, the model had an overall error rate of only 0.0062. This result demonstrates the effectiveness of the model when presented with data it had not encountered in training.
FALSE TRUE
Red 285 5
White 3 1007
To further improve the model’s performance, the presence of outliers in the training data was considered. Using Cook’s distance, one outlier (with a distance of 3.0858) was identified. However, after refitting a model without this outlier, there was a very small difference in overall performance as seen below XXX. Thus, we elected to keep the outlier in the data—especially, since we had little insight into the data and could not deduce whether it was a corrupted datapoint.
Call:
glm(formula = Type ~ . - quality - sulphates - citric.acid, family = "binomial",
data = new_data)
Deviance Residuals:
Min 1Q Median 3Q Max
-5.6180 -0.0002 0.0167 0.0427 4.5510
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 2.468e+03 2.139e+02 11.534 < 2e-16 ***
fixed.acidity 1.046e+00 2.562e-01 4.083 4.44e-05 ***
volatile.acidity -6.460e+00 1.050e+00 -6.155 7.51e-10 ***
residual.sugar 1.016e+00 1.039e-01 9.783 < 2e-16 ***
chlorides -2.449e+01 4.777e+00 -5.126 2.96e-07 ***
free.sulfur.dioxide -5.566e-02 1.710e-02 -3.255 0.00113 **
total.sulfur.dioxide 5.676e-02 5.716e-03 9.930 < 2e-16 ***
density -2.476e+03 2.177e+02 -11.374 < 2e-16 ***
pH 3.867e+00 1.647e+00 2.347 0.01890 *
alcohol -2.650e+00 3.304e-01 -8.021 1.05e-15 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 5865.66 on 5195 degrees of freedom
Residual deviance: 286.49 on 5186 degrees of freedom
AIC: 306.49
Number of Fisher Scoring iterations: 9
FALSE TRUE
Red 285 5
White 2 1008
Finally, collinearity among the predictors was examined in the reduced model. Using the VIF formula, density was very correlated with the other predictors (having a VIF value of 13.714). Since this value was over 10, this parameter was dropped in order to reduce collinearity and by extension the predicted coefficients’ variances. In addition, the correlation matrix of the predictors showed a strong association between free sulfur dioxide and total sulfur dioxide (correlation of 0.7156). Since one of these features could explain much of the variation in the other, it was determined one should be dropped. When comparing models with only one of these features, it was evident that total sulfur dioxide was more important in predicting wine type than free sulfur dioxide. Therefore, free sulfur dioxide was dropped from the model. The performance of this new reduced model can be seen below XXX.
fixed.acidity volatile.acidity residual.sugar chlorides
4.534992 1.648202 6.411491 1.539680
free.sulfur.dioxide total.sulfur.dioxide density pH
2.106400 2.801104 13.714057 2.502181
alcohol
4.153380
fixed.acidity volatile.acidity residual.sugar chlorides free.sulfur.dioxide
fixed.acidity 1.00000000 0.21475470 -0.1143578 0.30907486 -0.28545891
volatile.acidity 0.21475470 1.00000000 -0.1971152 0.38572698 -0.36296634
residual.sugar -0.11435784 -0.19711519 1.0000000 -0.13267263 0.40674512
chlorides 0.30907486 0.38572698 -0.1326726 1.00000000 -0.20583891
free.sulfur.dioxide -0.28545891 -0.36296634 0.4067451 -0.20583891 1.00000000
total.sulfur.dioxide -0.32829667 -0.42380894 0.4945946 -0.29216595 0.71557904
density 0.46102677 0.27340816 0.5468642 0.36564000 0.01712365
pH -0.25053297 0.26430327 -0.2656167 0.04191555 -0.15788076
alcohol -0.09179407 -0.03401873 -0.3484276 -0.25617404 -0.17708863
total.sulfur.dioxide density pH alcohol
fixed.acidity -0.3282967 0.46102677 -0.25053297 -0.09179407
volatile.acidity -0.4238089 0.27340816 0.26430327 -0.03401873
residual.sugar 0.4945946 0.54686423 -0.26561668 -0.34842765
chlorides -0.2921660 0.36564000 0.04191555 -0.25617404
free.sulfur.dioxide 0.7155790 0.01712365 -0.15788076 -0.17708863
total.sulfur.dioxide 1.0000000 0.02373810 -0.24648617 -0.26082628
density 0.0237381 1.00000000 0.01687243 -0.67847127
pH -0.2464862 0.01687243 1.00000000 0.11700658
alcohol -0.2608263 -0.67847127 0.11700658 1.00000000
glm.fit: fitted probabilities numerically 0 or 1 occurred
Call:
glm(formula = Type ~ . - quality - pH - fixed.acidity - citric.acid -
density - free.sulfur.dioxide, family = "binomial", data = train)
Deviance Residuals:
Min 1Q Median 3Q Max
-6.1013 0.0000 0.0139 0.0522 3.0217
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 1.073845 1.177468 0.912 0.362
volatile.acidity -12.420039 0.793575 -15.651 < 2e-16 ***
residual.sugar 0.190465 0.047838 3.981 6.85e-05 ***
chlorides -36.881169 3.376694 -10.922 < 2e-16 ***
total.sulfur.dioxide 0.065168 0.003563 18.292 < 2e-16 ***
sulphates -12.241130 0.880881 -13.896 < 2e-16 ***
alcohol 0.726637 0.100961 7.197 6.14e-13 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 5866.24 on 5196 degrees of freedom
Residual deviance: 644.11 on 5190 degrees of freedom
AIC: 658.11
Number of Fisher Scoring iterations: 9
FALSE TRUE
Red 284 6
White 25 985
Though the model’s performance decreased slightly (overall error rate went from 0.0062 to 0.0297 and AUC went from 0.9947 to 0.9892), the decrease in variance of the coefficients should make the model more reliable in terms of interpreting the relationships between the predictors and wine type.
Overall, this final model is quite effective at determining wine type. When looking at the estimated coefficients, it seems as though higher residual sugar, total sulfur dioxide, and alcohol are associated with white wines, while higher volatile acidity, chlorides, and sulphates are associated with red wines. Together, these predictors explain much of the variation in the predictor variable, wine type (residual deviance of only 286.49).